0.11.1-alt1

- 0.11.1
Сергей Конев 2024-10-16 14:15:33 +03:00
parent 0e2054b322
commit 576b4e53aa
18959 changed files with 5148223 additions and 1 deletion


@ -1 +1,2 @@
tar: v@version@:.
tar: v@version@:.
tar: vendor name=@name@-@version@-vendor base=vendor/


@ -10,6 +10,7 @@ Url: https://github.com/typst/typst
Packager: Sergey Konev <konevsa@altlinux.org>
Source: %name-%version.tar
Source1: %name-%version-vendor.tar
BuildRequires(pre): rpm-build-rust
BuildRequires: /proc

vendor/adler/.cargo-checksum.json vendored Normal file

@ -0,0 +1 @@
{"files":{"CHANGELOG.md":"737088e45fdf27fe2cfedce163332d8ce08c58fd86ca287de2de34c0fbaf63e7","Cargo.toml":"f410869f0f1a5697f65a8a77be03da7aeecc0be26e7cf3a1feb1acaa4f518770","LICENSE-0BSD":"861399f8c21c042b110517e76dc6b63a2b334276c8cf17412fc3c8908ca8dc17","LICENSE-APACHE":"8ada45cd9f843acf64e4722ae262c622a2b3b3007c7310ef36ac1061a30f6adb","LICENSE-MIT":"23f18e03dc49df91622fe2a76176497404e46ced8a715d9d2b67a7446571cca3","README.md":"308c50cdb42b9573743068158339570b45ca3f895015ca3b87ba983edb0a21e6","RELEASE_PROCESS.md":"a86cd10fc70f167f8d00e9e4ce0c6b4ebdfa1865058390dffd1e0ad4d3e68d9d","benches/bench.rs":"c07ce370e3680c602e415f8d1ec4e543ea2163ab22a09b6b82d93e8a30adca82","src/algo.rs":"b664b131f724a809591394a10b9023f40ab5963e32a83fa3163c2668e59c8b66","src/lib.rs":"b55ba9c629b30360d08168b2ca0c96275432856a539737a105a6d6ae6bf7e88f"},"package":"f26201604c87b1e01bd3d98f8d5d9a8fcbb815e8cedb41ffccbeb4bf593a35fe"}

vendor/adler/CHANGELOG.md vendored Normal file

@ -0,0 +1,63 @@
# Changelog
## Unreleased
No changes.
## [1.0.2 - 2021-02-26](https://github.com/jonas-schievink/adler/releases/tag/v1.0.2)
- Fix doctest on big-endian systems ([#9]).
[#9]: https://github.com/jonas-schievink/adler/pull/9
## [1.0.1 - 2020-11-08](https://github.com/jonas-schievink/adler/releases/tag/v1.0.1)
### Fixes
- Fix documentation on docs.rs.
## [1.0.0 - 2020-11-08](https://github.com/jonas-schievink/adler/releases/tag/v1.0.0)
### Fixes
- Fix `cargo test --no-default-features` ([#5]).
### Improvements
- Extended and clarified documentation.
- Added more rustdoc examples.
- Extended CI to test the crate with `--no-default-features`.
### Breaking Changes
- `adler32_reader` now takes its generic argument by value instead of as a `&mut`.
- Renamed `adler32_reader` to `adler32`.
## [0.2.3 - 2020-07-11](https://github.com/jonas-schievink/adler/releases/tag/v0.2.3)
- Process 4 Bytes at a time, improving performance by up to 50% ([#2]).
## [0.2.2 - 2020-06-27](https://github.com/jonas-schievink/adler/releases/tag/v0.2.2)
- Bump MSRV to 1.31.0.
## [0.2.1 - 2020-06-27](https://github.com/jonas-schievink/adler/releases/tag/v0.2.1)
- Add a few `#[inline]` annotations to small functions.
- Fix CI badge.
- Allow integration into libstd.
## [0.2.0 - 2020-06-27](https://github.com/jonas-schievink/adler/releases/tag/v0.2.0)
- Support `#![no_std]` when using `default-features = false`.
- Improve performance by around 7x.
- Support Rust 1.8.0.
- Improve API naming.
## [0.1.0 - 2020-06-26](https://github.com/jonas-schievink/adler/releases/tag/v0.1.0)
Initial release.
[#2]: https://github.com/jonas-schievink/adler/pull/2
[#5]: https://github.com/jonas-schievink/adler/pull/5

vendor/adler/Cargo.toml vendored Normal file

@ -0,0 +1,64 @@
# THIS FILE IS AUTOMATICALLY GENERATED BY CARGO
#
# When uploading crates to the registry Cargo will automatically
# "normalize" Cargo.toml files for maximal compatibility
# with all versions of Cargo and also rewrite `path` dependencies
# to registry (e.g., crates.io) dependencies
#
# If you believe there's an error in this file please file an
# issue against the rust-lang/cargo repository. If you're
# editing this file be aware that the upstream Cargo.toml
# will likely look very different (and much more reasonable)
[package]
name = "adler"
version = "1.0.2"
authors = ["Jonas Schievink <jonasschievink@gmail.com>"]
description = "A simple clean-room implementation of the Adler-32 checksum"
documentation = "https://docs.rs/adler/"
readme = "README.md"
keywords = ["checksum", "integrity", "hash", "adler32", "zlib"]
categories = ["algorithms"]
license = "0BSD OR MIT OR Apache-2.0"
repository = "https://github.com/jonas-schievink/adler.git"
[package.metadata.docs.rs]
rustdoc-args = ["--cfg=docsrs"]
[package.metadata.release]
no-dev-version = true
pre-release-commit-message = "Release {{version}}"
tag-message = "{{version}}"
[[package.metadata.release.pre-release-replacements]]
file = "CHANGELOG.md"
replace = "## Unreleased\n\nNo changes.\n\n## [{{version}} - {{date}}](https://github.com/jonas-schievink/adler/releases/tag/v{{version}})\n"
search = "## Unreleased\n"
[[package.metadata.release.pre-release-replacements]]
file = "README.md"
replace = "adler = \"{{version}}\""
search = "adler = \"[a-z0-9\\\\.-]+\""
[[package.metadata.release.pre-release-replacements]]
file = "src/lib.rs"
replace = "https://docs.rs/adler/{{version}}"
search = "https://docs.rs/adler/[a-z0-9\\.-]+"
[[bench]]
name = "bench"
harness = false
[dependencies.compiler_builtins]
version = "0.1.2"
optional = true
[dependencies.core]
version = "1.0.0"
optional = true
package = "rustc-std-workspace-core"
[dev-dependencies.criterion]
version = "0.3.2"
[features]
default = ["std"]
rustc-dep-of-std = ["core", "compiler_builtins"]
std = []

vendor/adler/LICENSE-0BSD vendored Normal file

@ -0,0 +1,12 @@
Copyright (C) Jonas Schievink <jonasschievink@gmail.com>
Permission to use, copy, modify, and/or distribute this software for
any purpose with or without fee is hereby granted.
THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR
ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN
AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT
OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.

vendor/adler/LICENSE-APACHE vendored Normal file

@ -0,0 +1,201 @@
Apache License
Version 2.0, January 2004
https://www.apache.org/licenses/LICENSE-2.0
TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION
1. Definitions.
"License" shall mean the terms and conditions for use, reproduction,
and distribution as defined by Sections 1 through 9 of this document.
"Licensor" shall mean the copyright owner or entity authorized by
the copyright owner that is granting the License.
"Legal Entity" shall mean the union of the acting entity and all
other entities that control, are controlled by, or are under common
control with that entity. For the purposes of this definition,
"control" means (i) the power, direct or indirect, to cause the
direction or management of such entity, whether by contract or
otherwise, or (ii) ownership of fifty percent (50%) or more of the
outstanding shares, or (iii) beneficial ownership of such entity.
"You" (or "Your") shall mean an individual or Legal Entity
exercising permissions granted by this License.
"Source" form shall mean the preferred form for making modifications,
including but not limited to software source code, documentation
source, and configuration files.
"Object" form shall mean any form resulting from mechanical
transformation or translation of a Source form, including but
not limited to compiled object code, generated documentation,
and conversions to other media types.
"Work" shall mean the work of authorship, whether in Source or
Object form, made available under the License, as indicated by a
copyright notice that is included in or attached to the work
(an example is provided in the Appendix below).
"Derivative Works" shall mean any work, whether in Source or Object
form, that is based on (or derived from) the Work and for which the
editorial revisions, annotations, elaborations, or other modifications
represent, as a whole, an original work of authorship. For the purposes
of this License, Derivative Works shall not include works that remain
separable from, or merely link (or bind by name) to the interfaces of,
the Work and Derivative Works thereof.
"Contribution" shall mean any work of authorship, including
the original version of the Work and any modifications or additions
to that Work or Derivative Works thereof, that is intentionally
submitted to Licensor for inclusion in the Work by the copyright owner
or by an individual or Legal Entity authorized to submit on behalf of
the copyright owner. For the purposes of this definition, "submitted"
means any form of electronic, verbal, or written communication sent
to the Licensor or its representatives, including but not limited to
communication on electronic mailing lists, source code control systems,
and issue tracking systems that are managed by, or on behalf of, the
Licensor for the purpose of discussing and improving the Work, but
excluding communication that is conspicuously marked or otherwise
designated in writing by the copyright owner as "Not a Contribution."
"Contributor" shall mean Licensor and any individual or Legal Entity
on behalf of whom a Contribution has been received by Licensor and
subsequently incorporated within the Work.
2. Grant of Copyright License. Subject to the terms and conditions of
this License, each Contributor hereby grants to You a perpetual,
worldwide, non-exclusive, no-charge, royalty-free, irrevocable
copyright license to reproduce, prepare Derivative Works of,
publicly display, publicly perform, sublicense, and distribute the
Work and such Derivative Works in Source or Object form.
3. Grant of Patent License. Subject to the terms and conditions of
this License, each Contributor hereby grants to You a perpetual,
worldwide, non-exclusive, no-charge, royalty-free, irrevocable
(except as stated in this section) patent license to make, have made,
use, offer to sell, sell, import, and otherwise transfer the Work,
where such license applies only to those patent claims licensable
by such Contributor that are necessarily infringed by their
Contribution(s) alone or by combination of their Contribution(s)
with the Work to which such Contribution(s) was submitted. If You
institute patent litigation against any entity (including a
cross-claim or counterclaim in a lawsuit) alleging that the Work
or a Contribution incorporated within the Work constitutes direct
or contributory patent infringement, then any patent licenses
granted to You under this License for that Work shall terminate
as of the date such litigation is filed.
4. Redistribution. You may reproduce and distribute copies of the
Work or Derivative Works thereof in any medium, with or without
modifications, and in Source or Object form, provided that You
meet the following conditions:
(a) You must give any other recipients of the Work or
Derivative Works a copy of this License; and
(b) You must cause any modified files to carry prominent notices
stating that You changed the files; and
(c) You must retain, in the Source form of any Derivative Works
that You distribute, all copyright, patent, trademark, and
attribution notices from the Source form of the Work,
excluding those notices that do not pertain to any part of
the Derivative Works; and
(d) If the Work includes a "NOTICE" text file as part of its
distribution, then any Derivative Works that You distribute must
include a readable copy of the attribution notices contained
within such NOTICE file, excluding those notices that do not
pertain to any part of the Derivative Works, in at least one
of the following places: within a NOTICE text file distributed
as part of the Derivative Works; within the Source form or
documentation, if provided along with the Derivative Works; or,
within a display generated by the Derivative Works, if and
wherever such third-party notices normally appear. The contents
of the NOTICE file are for informational purposes only and
do not modify the License. You may add Your own attribution
notices within Derivative Works that You distribute, alongside
or as an addendum to the NOTICE text from the Work, provided
that such additional attribution notices cannot be construed
as modifying the License.
You may add Your own copyright statement to Your modifications and
may provide additional or different license terms and conditions
for use, reproduction, or distribution of Your modifications, or
for any such Derivative Works as a whole, provided Your use,
reproduction, and distribution of the Work otherwise complies with
the conditions stated in this License.
5. Submission of Contributions. Unless You explicitly state otherwise,
any Contribution intentionally submitted for inclusion in the Work
by You to the Licensor shall be under the terms and conditions of
this License, without any additional terms or conditions.
Notwithstanding the above, nothing herein shall supersede or modify
the terms of any separate license agreement you may have executed
with Licensor regarding such Contributions.
6. Trademarks. This License does not grant permission to use the trade
names, trademarks, service marks, or product names of the Licensor,
except as required for reasonable and customary use in describing the
origin of the Work and reproducing the content of the NOTICE file.
7. Disclaimer of Warranty. Unless required by applicable law or
agreed to in writing, Licensor provides the Work (and each
Contributor provides its Contributions) on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
implied, including, without limitation, any warranties or conditions
of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A
PARTICULAR PURPOSE. You are solely responsible for determining the
appropriateness of using or redistributing the Work and assume any
risks associated with Your exercise of permissions under this License.
8. Limitation of Liability. In no event and under no legal theory,
whether in tort (including negligence), contract, or otherwise,
unless required by applicable law (such as deliberate and grossly
negligent acts) or agreed to in writing, shall any Contributor be
liable to You for damages, including any direct, indirect, special,
incidental, or consequential damages of any character arising as a
result of this License or out of the use or inability to use the
Work (including but not limited to damages for loss of goodwill,
work stoppage, computer failure or malfunction, or any and all
other commercial damages or losses), even if such Contributor
has been advised of the possibility of such damages.
9. Accepting Warranty or Additional Liability. While redistributing
the Work or Derivative Works thereof, You may choose to offer,
and charge a fee for, acceptance of support, warranty, indemnity,
or other liability obligations and/or rights consistent with this
License. However, in accepting such obligations, You may act only
on Your own behalf and on Your sole responsibility, not on behalf
of any other Contributor, and only if You agree to indemnify,
defend, and hold each Contributor harmless for any liability
incurred by, or claims asserted against, such Contributor by reason
of your accepting any such warranty or additional liability.
END OF TERMS AND CONDITIONS
APPENDIX: How to apply the Apache License to your work.
To apply the Apache License to your work, attach the following
boilerplate notice, with the fields enclosed by brackets "[]"
replaced with your own identifying information. (Don't include
the brackets!) The text should be enclosed in the appropriate
comment syntax for the file format. We also recommend that a
file or class name and description of purpose be included on the
same "printed page" as the copyright notice for easier
identification within third-party archives.
Copyright [yyyy] [name of copyright owner]
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
https://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.

vendor/adler/LICENSE-MIT vendored Normal file

@ -0,0 +1,23 @@
Permission is hereby granted, free of charge, to any
person obtaining a copy of this software and associated
documentation files (the "Software"), to deal in the
Software without restriction, including without
limitation the rights to use, copy, modify, merge,
publish, distribute, sublicense, and/or sell copies of
the Software, and to permit persons to whom the Software
is furnished to do so, subject to the following
conditions:
The above copyright notice and this permission notice
shall be included in all copies or substantial portions
of the Software.
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF
ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED
TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A
PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT
SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR
IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
DEALINGS IN THE SOFTWARE.

vendor/adler/README.md vendored Normal file

@ -0,0 +1,39 @@
# Adler-32 checksums for Rust
[![crates.io](https://img.shields.io/crates/v/adler.svg)](https://crates.io/crates/adler)
[![docs.rs](https://docs.rs/adler/badge.svg)](https://docs.rs/adler/)
![CI](https://github.com/jonas-schievink/adler/workflows/CI/badge.svg)
This crate provides a simple implementation of the Adler-32 checksum, used in
the zlib compression format.
Please refer to the [changelog](CHANGELOG.md) to see what changed in the last
releases.
## Features
- Permissively licensed (0BSD) clean-room implementation.
- Zero dependencies.
- Zero `unsafe`.
- Decent performance (3-4 GB/s).
- Supports `#![no_std]` (with `default-features = false`).
## Usage
Add an entry to your `Cargo.toml`:
```toml
[dependencies]
adler = "1.0.2"
```
Check the [API Documentation](https://docs.rs/adler/) for how to use the
crate's functionality.
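A short usage sketch (not from this README; it relies only on the public `adler32_slice` function and the incremental `Adler32` hasher documented on docs.rs, and the expected value is the Wikipedia example from the crate's own tests):
```rust
use adler::{adler32_slice, Adler32};

fn main() {
    // One-shot convenience function.
    assert_eq!(adler32_slice(b"Wikipedia"), 0x11E60398);

    // Incremental calculation over several slices yields the same checksum,
    // since Adler-32 is defined over the concatenated byte stream.
    let mut hasher = Adler32::new();
    hasher.write_slice(b"Wiki");
    hasher.write_slice(b"pedia");
    assert_eq!(hasher.checksum(), 0x11E60398);
}
```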
## Rust version support
Currently, this crate supports all Rust versions starting at Rust 1.31.0.
Bumping the Minimum Supported Rust Version (MSRV) is *not* considered a breaking
change, but will not be done without good reasons. The latest 3 stable Rust
versions will always be supported no matter what.

vendor/adler/RELEASE_PROCESS.md vendored Normal file

@ -0,0 +1,13 @@
# What to do to publish a new release
1. Ensure all notable changes are in the changelog under "Unreleased".
2. Execute `cargo release <level>` to bump version(s), tag and publish
everything. External subcommand, must be installed with `cargo install
cargo-release`.
`<level>` can be one of `major|minor|patch`. If this is the first release
(`0.1.0`), use `minor`, since the version starts out as `0.0.0`.
3. Go to the GitHub releases, edit the just-pushed tag. Copy the release notes
from the changelog.

vendor/adler/benches/bench.rs vendored Normal file

@ -0,0 +1,109 @@
extern crate adler;
extern crate criterion;
use adler::{adler32_slice, Adler32};
use criterion::{criterion_group, criterion_main, Criterion, Throughput};
fn simple(c: &mut Criterion) {
{
const SIZE: usize = 100;
let mut group = c.benchmark_group("simple-100b");
group.throughput(Throughput::Bytes(SIZE as u64));
group.bench_function("zeroes-100", |bencher| {
bencher.iter(|| {
adler32_slice(&[0; SIZE]);
});
});
group.bench_function("ones-100", |bencher| {
bencher.iter(|| {
adler32_slice(&[0xff; SIZE]);
});
});
}
{
const SIZE: usize = 1024;
let mut group = c.benchmark_group("simple-1k");
group.throughput(Throughput::Bytes(SIZE as u64));
group.bench_function("zeroes-1k", |bencher| {
bencher.iter(|| {
adler32_slice(&[0; SIZE]);
});
});
group.bench_function("ones-1k", |bencher| {
bencher.iter(|| {
adler32_slice(&[0xff; SIZE]);
});
});
}
{
const SIZE: usize = 1024 * 1024;
let mut group = c.benchmark_group("simple-1m");
group.throughput(Throughput::Bytes(SIZE as u64));
group.bench_function("zeroes-1m", |bencher| {
bencher.iter(|| {
adler32_slice(&[0; SIZE]);
});
});
group.bench_function("ones-1m", |bencher| {
bencher.iter(|| {
adler32_slice(&[0xff; SIZE]);
});
});
}
}
fn chunked(c: &mut Criterion) {
const SIZE: usize = 16 * 1024 * 1024;
let data = vec![0xAB; SIZE];
let mut group = c.benchmark_group("chunked-16m");
group.throughput(Throughput::Bytes(SIZE as u64));
group.bench_function("5552", |bencher| {
bencher.iter(|| {
let mut h = Adler32::new();
for chunk in data.chunks(5552) {
h.write_slice(chunk);
}
h.checksum()
});
});
group.bench_function("8k", |bencher| {
bencher.iter(|| {
let mut h = Adler32::new();
for chunk in data.chunks(8 * 1024) {
h.write_slice(chunk);
}
h.checksum()
});
});
group.bench_function("64k", |bencher| {
bencher.iter(|| {
let mut h = Adler32::new();
for chunk in data.chunks(64 * 1024) {
h.write_slice(chunk);
}
h.checksum()
});
});
group.bench_function("1m", |bencher| {
bencher.iter(|| {
let mut h = Adler32::new();
for chunk in data.chunks(1024 * 1024) {
h.write_slice(chunk);
}
h.checksum()
});
});
}
criterion_group!(benches, simple, chunked);
criterion_main!(benches);

vendor/adler/src/algo.rs vendored Normal file

@ -0,0 +1,146 @@
use crate::Adler32;
use std::ops::{AddAssign, MulAssign, RemAssign};
impl Adler32 {
pub(crate) fn compute(&mut self, bytes: &[u8]) {
// The basic algorithm is, for every byte:
// a = (a + byte) % MOD
// b = (b + a) % MOD
// where MOD = 65521.
//
// For efficiency, we can defer the `% MOD` operations as long as neither a nor b overflows:
// - Between calls to `write`, we ensure that a and b are always in range 0..MOD.
// - We use 32-bit arithmetic in this function.
// - Therefore, a and b must not increase by more than 2^32-MOD without performing a `% MOD`
// operation.
//
// According to Wikipedia, b is calculated as follows for non-incremental checksumming:
// b = n×D1 + (n-1)×D2 + (n-2)×D3 + ... + Dn + n*1 (mod 65521)
// Where n is the number of bytes and Di is the i-th Byte. We need to change this to account
// for the previous values of a and b, as well as treat every input Byte as being 255:
// b_inc = n×255 + (n-1)×255 + ... + 255 + n*65520
// Or in other words:
// b_inc = n*65520 + n(n+1)/2*255
// The max chunk size is thus the largest value of n so that b_inc <= 2^32-65521.
// 2^32-65521 = n*65520 + n(n+1)/2*255
// Plugging this into an equation solver since I can't math gives n = 5552.18..., so 5552.
//
// On top of the optimization outlined above, the algorithm can also be parallelized with a
// bit more work:
//
// Note that b is a linear combination of a vector of input bytes (D1, ..., Dn).
//
// If we fix some value k<N and rewrite indices 1, ..., N as
//
// 1_1, 1_2, ..., 1_k, 2_1, ..., 2_k, ..., (N/k)_k,
//
// then we can express a and b in terms of sums of smaller sequences kb and ka:
//
// ka(j) := D1_j + D2_j + ... + D(N/k)_j where j <= k
// kb(j) := (N/k)*D1_j + (N/k-1)*D2_j + ... + D(N/k)_j where j <= k
//
// a = ka(1) + ka(2) + ... + ka(k) + 1
// b = k*(kb(1) + kb(2) + ... + kb(k)) - 1*ka(2) - ... - (k-1)*ka(k) + N
//
// We use this insight to unroll the main loop and process k=4 bytes at a time.
// The resulting code is highly amenable to SIMD acceleration, although the immediate speedups
// stem from increased pipeline parallelism rather than auto-vectorization.
//
// This technique is described in-depth (here:)[https://software.intel.com/content/www/us/\
// en/develop/articles/fast-computation-of-fletcher-checksums.html]
const MOD: u32 = 65521;
const CHUNK_SIZE: usize = 5552 * 4;
let mut a = u32::from(self.a);
let mut b = u32::from(self.b);
let mut a_vec = U32X4([0; 4]);
let mut b_vec = a_vec;
let (bytes, remainder) = bytes.split_at(bytes.len() - bytes.len() % 4);
// iterate over 4 bytes at a time
let chunk_iter = bytes.chunks_exact(CHUNK_SIZE);
let remainder_chunk = chunk_iter.remainder();
for chunk in chunk_iter {
for byte_vec in chunk.chunks_exact(4) {
let val = U32X4::from(byte_vec);
a_vec += val;
b_vec += a_vec;
}
b += CHUNK_SIZE as u32 * a;
a_vec %= MOD;
b_vec %= MOD;
b %= MOD;
}
// special-case the final chunk because it may be shorter than the rest
for byte_vec in remainder_chunk.chunks_exact(4) {
let val = U32X4::from(byte_vec);
a_vec += val;
b_vec += a_vec;
}
b += remainder_chunk.len() as u32 * a;
a_vec %= MOD;
b_vec %= MOD;
b %= MOD;
// combine the sub-sum results into the main sum
b_vec *= 4;
b_vec.0[1] += MOD - a_vec.0[1];
b_vec.0[2] += (MOD - a_vec.0[2]) * 2;
b_vec.0[3] += (MOD - a_vec.0[3]) * 3;
for &av in a_vec.0.iter() {
a += av;
}
for &bv in b_vec.0.iter() {
b += bv;
}
// iterate over the remaining few bytes in serial
for &byte in remainder.iter() {
a += u32::from(byte);
b += a;
}
self.a = (a % MOD) as u16;
self.b = (b % MOD) as u16;
}
}
#[derive(Copy, Clone)]
struct U32X4([u32; 4]);
impl U32X4 {
fn from(bytes: &[u8]) -> Self {
U32X4([
u32::from(bytes[0]),
u32::from(bytes[1]),
u32::from(bytes[2]),
u32::from(bytes[3]),
])
}
}
impl AddAssign<Self> for U32X4 {
fn add_assign(&mut self, other: Self) {
for (s, o) in self.0.iter_mut().zip(other.0.iter()) {
*s += o;
}
}
}
impl RemAssign<u32> for U32X4 {
fn rem_assign(&mut self, quotient: u32) {
for s in self.0.iter_mut() {
*s %= quotient;
}
}
}
impl MulAssign<u32> for U32X4 {
fn mul_assign(&mut self, rhs: u32) {
for s in self.0.iter_mut() {
*s *= rhs;
}
}
}

vendor/adler/src/lib.rs vendored Normal file

@ -0,0 +1,287 @@
//! Adler-32 checksum implementation.
//!
//! This implementation features:
//!
//! - Permissively licensed (0BSD) clean-room implementation.
//! - Zero dependencies.
//! - Zero `unsafe`.
//! - Decent performance (3-4 GB/s).
//! - `#![no_std]` support (with `default-features = false`).
#![doc(html_root_url = "https://docs.rs/adler/1.0.2")]
// Deny a few warnings in doctests, since rustdoc `allow`s many warnings by default
#![doc(test(attr(deny(unused_imports, unused_must_use))))]
#![cfg_attr(docsrs, feature(doc_cfg))]
#![warn(missing_debug_implementations)]
#![forbid(unsafe_code)]
#![cfg_attr(not(feature = "std"), no_std)]
#[cfg(not(feature = "std"))]
extern crate core as std;
mod algo;
use std::hash::Hasher;
#[cfg(feature = "std")]
use std::io::{self, BufRead};
/// Adler-32 checksum calculator.
///
/// An instance of this type is equivalent to an Adler-32 checksum: It can be created in the default
/// state via [`new`] (or the provided `Default` impl), or from a precalculated checksum via
/// [`from_checksum`], and the currently stored checksum can be fetched via [`checksum`].
///
/// This type also implements `Hasher`, which makes it easy to calculate Adler-32 checksums of any
/// type that implements or derives `Hash`. This also allows using Adler-32 in a `HashMap`, although
/// that is not recommended (while every checksum is a hash function, they are not necessarily a
/// good one).
///
/// # Examples
///
/// Basic, piecewise checksum calculation:
///
/// ```
/// use adler::Adler32;
///
/// let mut adler = Adler32::new();
///
/// adler.write_slice(&[0, 1, 2]);
/// adler.write_slice(&[3, 4, 5]);
///
/// assert_eq!(adler.checksum(), 0x00290010);
/// ```
///
/// Using `Hash` to process structures:
///
/// ```
/// use std::hash::Hash;
/// use adler::Adler32;
///
/// #[derive(Hash)]
/// struct Data {
/// byte: u8,
/// word: u16,
/// big: u64,
/// }
///
/// let mut adler = Adler32::new();
///
/// let data = Data { byte: 0x1F, word: 0xABCD, big: !0 };
/// data.hash(&mut adler);
///
/// // hash value depends on architecture endianness
/// if cfg!(target_endian = "little") {
/// assert_eq!(adler.checksum(), 0x33410990);
/// }
/// if cfg!(target_endian = "big") {
/// assert_eq!(adler.checksum(), 0x331F0990);
/// }
///
/// ```
///
/// [`new`]: #method.new
/// [`from_checksum`]: #method.from_checksum
/// [`checksum`]: #method.checksum
#[derive(Debug, Copy, Clone)]
pub struct Adler32 {
a: u16,
b: u16,
}
impl Adler32 {
/// Creates a new Adler-32 instance with default state.
#[inline]
pub fn new() -> Self {
Self::default()
}
/// Creates an `Adler32` instance from a precomputed Adler-32 checksum.
///
/// This allows resuming checksum calculation without having to keep the `Adler32` instance
/// around.
///
/// # Example
///
/// ```
/// # use adler::Adler32;
/// let parts = [
/// "rust",
/// "acean",
/// ];
/// let whole = adler::adler32_slice(b"rustacean");
///
/// let mut sum = Adler32::new();
/// sum.write_slice(parts[0].as_bytes());
/// let partial = sum.checksum();
///
/// // ...later
///
/// let mut sum = Adler32::from_checksum(partial);
/// sum.write_slice(parts[1].as_bytes());
/// assert_eq!(sum.checksum(), whole);
/// ```
#[inline]
pub fn from_checksum(sum: u32) -> Self {
Adler32 {
a: sum as u16,
b: (sum >> 16) as u16,
}
}
/// Returns the calculated checksum at this point in time.
#[inline]
pub fn checksum(&self) -> u32 {
(u32::from(self.b) << 16) | u32::from(self.a)
}
/// Adds `bytes` to the checksum calculation.
///
/// If efficiency matters, this should be called with Byte slices that contain at least a few
/// thousand Bytes.
pub fn write_slice(&mut self, bytes: &[u8]) {
self.compute(bytes);
}
}
impl Default for Adler32 {
#[inline]
fn default() -> Self {
Adler32 { a: 1, b: 0 }
}
}
impl Hasher for Adler32 {
#[inline]
fn finish(&self) -> u64 {
u64::from(self.checksum())
}
fn write(&mut self, bytes: &[u8]) {
self.write_slice(bytes);
}
}
/// Calculates the Adler-32 checksum of a byte slice.
///
/// This is a convenience function around the [`Adler32`] type.
///
/// [`Adler32`]: struct.Adler32.html
pub fn adler32_slice(data: &[u8]) -> u32 {
let mut h = Adler32::new();
h.write_slice(data);
h.checksum()
}
/// Calculates the Adler-32 checksum of a `BufRead`'s contents.
///
/// The passed `BufRead` implementor will be read until it reaches EOF (or until it reports an
/// error).
///
/// If you only have a `Read` implementor, you can wrap it in `std::io::BufReader` before calling
/// this function.
///
/// # Errors
///
/// Any error returned by the reader is bubbled up by this function.
///
/// # Examples
///
/// ```no_run
/// # fn run() -> Result<(), Box<dyn std::error::Error>> {
/// use adler::adler32;
///
/// use std::fs::File;
/// use std::io::BufReader;
///
/// let file = File::open("input.txt")?;
/// let mut file = BufReader::new(file);
///
/// adler32(&mut file)?;
/// # Ok(()) }
/// # fn main() { run().unwrap() }
/// ```
#[cfg(feature = "std")]
#[cfg_attr(docsrs, doc(cfg(feature = "std")))]
pub fn adler32<R: BufRead>(mut reader: R) -> io::Result<u32> {
let mut h = Adler32::new();
loop {
let len = {
let buf = reader.fill_buf()?;
if buf.is_empty() {
return Ok(h.checksum());
}
h.write_slice(buf);
buf.len()
};
reader.consume(len);
}
}
#[cfg(test)]
mod tests {
use super::*;
#[test]
fn zeroes() {
assert_eq!(adler32_slice(&[]), 1);
assert_eq!(adler32_slice(&[0]), 1 | 1 << 16);
assert_eq!(adler32_slice(&[0, 0]), 1 | 2 << 16);
assert_eq!(adler32_slice(&[0; 100]), 0x00640001);
assert_eq!(adler32_slice(&[0; 1024]), 0x04000001);
assert_eq!(adler32_slice(&[0; 1024 * 1024]), 0x00f00001);
}
#[test]
fn ones() {
assert_eq!(adler32_slice(&[0xff; 1024]), 0x79a6fc2e);
assert_eq!(adler32_slice(&[0xff; 1024 * 1024]), 0x8e88ef11);
}
#[test]
fn mixed() {
assert_eq!(adler32_slice(&[1]), 2 | 2 << 16);
assert_eq!(adler32_slice(&[40]), 41 | 41 << 16);
assert_eq!(adler32_slice(&[0xA5; 1024 * 1024]), 0xd5009ab1);
}
/// Example calculation from https://en.wikipedia.org/wiki/Adler-32.
#[test]
fn wiki() {
assert_eq!(adler32_slice(b"Wikipedia"), 0x11E60398);
}
#[test]
fn resume() {
let mut adler = Adler32::new();
adler.write_slice(&[0xff; 1024]);
let partial = adler.checksum();
assert_eq!(partial, 0x79a6fc2e); // from above
adler.write_slice(&[0xff; 1024 * 1024 - 1024]);
assert_eq!(adler.checksum(), 0x8e88ef11); // from above
// Make sure that we can resume computing from the partial checksum via `from_checksum`.
let mut adler = Adler32::from_checksum(partial);
adler.write_slice(&[0xff; 1024 * 1024 - 1024]);
assert_eq!(adler.checksum(), 0x8e88ef11); // from above
}
#[cfg(feature = "std")]
#[test]
fn bufread() {
use std::io::BufReader;
fn test(data: &[u8], checksum: u32) {
// `BufReader` uses an 8 KB buffer, so this will test buffer refilling.
let mut buf = BufReader::new(data);
let real_sum = adler32(&mut buf).unwrap();
assert_eq!(checksum, real_sum);
}
test(&[], 1);
test(&[0; 1024], 0x04000001);
test(&[0; 1024 * 1024], 0x00f00001);
test(&[0xA5; 1024 * 1024], 0xd5009ab1);
}
}


@ -0,0 +1 @@
{"files":{"COPYING":"01c266bced4a434da0051174d6bee16a4c82cf634e2679b6155d40d75012390f","Cargo.toml":"747d0fcb1257c9b8b013104da3c5a67f5d6cf8a95a2163b13703c01cab2c010a","DESIGN.md":"59c960e1b73b1d7fb41e4df6c0c1b1fcf44dd2ebc8a349597a7d0595f8cb5130","LICENSE-MIT":"0f96a83840e146e43c0ec96a22ec1f392e0680e6c1226e6f3ba87e0740af850f","README.md":"afc4d559a98cf190029af0bf320fc0022725e349cd2a303aac860254e28f3c53","UNLICENSE":"7e12e5df4bae12cb21581ba157ced20e1986a0508dd10d0e8a4ab9a4cf94e85c","rustfmt.toml":"1ca600239a27401c4a43f363cf3f38183a212affc1f31bff3ae93234bbaec228","src/ahocorasick.rs":"c699c07df70be45c666e128509ad571a7649d2073e4ae16ac1efd6793c9c6890","src/automaton.rs":"22258a3e118672413119f8f543a9b912cce954e63524575c0ebfdf9011f9c2dd","src/dfa.rs":"bfef1a94c5e7410584b1beb4e857b40d1ae2031b881cbc06fb1300409bbd555f","src/lib.rs":"2a92d5c5e930f2d306508802e8a929135e1f41c9f5f8deda8f7eb98947179dd2","src/macros.rs":"c6c52ae05b24433cffaca7b78b3645d797862c5d5feffddf9f54909095ed6e05","src/nfa/contiguous.rs":"aeb6ee5fd80eea04decbc4b46aa27d1ab270b78d416a644da25b7934f009ee66","src/nfa/mod.rs":"ee7b3109774d14bbad5239c16bb980dd6b8185ec136d94fbaf2f0dc27d5ffa15","src/nfa/noncontiguous.rs":"de94f02b04efd8744fb096759a8897c22012b0e0ca3ace161fd87c71befefe04","src/packed/api.rs":"160d3b10823316f7b0924e13c3afd222c8a7db5c0a00432401f311ef27d6a1b7","src/packed/ext.rs":"66be06fde8558429da23a290584d4b9fae665bf64c2578db4fe5f5f3ee864869","src/packed/mod.rs":"0020cd6f07ba5c8955923a9516d7f758864260eda53a6b6f629131c45ddeec62","src/packed/pattern.rs":"1e3a289a730c141fc30b295811e372d046c6619c7fd670308299b889a06c7673","src/packed/rabinkarp.rs":"403146eb1d838a84601d171393542340513cd1ee7ff750f2372161dd47746586","src/packed/teddy/README.md":"3a43194b64e221543d885176aba3beb1224a927385a20eca842daf6b0ea2f342","src/packed/teddy/builder.rs":"720735ea6c7ff92b081426513e6e82feed24a922849297bb538d28f7b8129f81","src/packed/teddy/generic.rs":"ea252ab05b32cea7dd9d71e332071d243db7dd0362e049252a27e5881ba2bf39","src/packed/teddy/mod.rs":"17d741f7e2fb9dbac5ba7d1bd4542cf1e35e9f146ace728e23fe6bbed20028b2","src/packed/tests.rs":"8e2f56eb3890ed3876ecb47d3121996e416563127b6430110d7b516df3f83b4b","src/packed/vector.rs":"840065521cbd4701fa5b8b506d1537843d858c903f7cadf3c68749ea1780874b","src/tests.rs":"c68192ab97b6161d0d6ee96fefd80cc7d14e4486ddcd8d1f82b5c92432c24ed5","src/transducer.rs":"02daa33a5d6dac41dcfd67f51df7c0d4a91c5131c781fb54c4de3520c585a6e1","src/util/alphabet.rs":"6dc22658a38deddc0279892035b18870d4585069e35ba7c7e649a24509acfbcc","src/util/buffer.rs":"f9e37f662c46c6ecd734458dedbe76c3bb0e84a93b6b0117c0d4ad3042413891","src/util/byte_frequencies.rs":"2fb85b381c038c1e44ce94294531cdcd339dca48b1e61f41455666e802cbbc9e","src/util/debug.rs":"ab301ad59aa912529cb97233a54a05914dd3cb2ec43e6fec7334170b97ac5998","src/util/error.rs":"ecccd60e7406305023efcc6adcc826eeeb083ab8f7fbfe3d97469438cd4c4e5c","src/util/int.rs":"4ab6dbdba10027ddec2af63a9b28ce4eee30ded0daa5d8eb068b2b55542b6039","src/util/mod.rs":"7ab28d11323ecdbd982087f32eb8bceeee84f1a2583f3aae27039c36d58cf12c","src/util/prefilter.rs":"9fa4498f18bf70478b1996c1a013698b626d15f119aa81dbc536673c9f045718","src/util/primitives.rs":"f89f3fa1d8db4e37de9ca767c6d05e346404837cade6d063bba68972fafa610b","src/util/remapper.rs":"9f12d911583a325c11806eeceb46d0dfec863cfcfa241aed84d31af73da746e5","src/util/search.rs":"6af803e08b8b8c8a33db100623f1621b0d741616524ce40893d8316897f27ffe","src/util/special.rs":"7d2f9cb9dd9771f59816e829b2d96b1239996f32939ba98764e121696c52b146"},"package":"b2969dcb958b36655471fc61f7e416fa76033bdd4bfe
d0678d8fee1e2d07a1f0"}

vendor/aho-corasick/COPYING vendored Normal file

@ -0,0 +1,3 @@
This project is dual-licensed under the Unlicense and MIT licenses.
You may use this code under the terms of either license.

vendor/aho-corasick/Cargo.toml vendored Normal file

@ -0,0 +1,74 @@
# THIS FILE IS AUTOMATICALLY GENERATED BY CARGO
#
# When uploading crates to the registry Cargo will automatically
# "normalize" Cargo.toml files for maximal compatibility
# with all versions of Cargo and also rewrite `path` dependencies
# to registry (e.g., crates.io) dependencies.
#
# If you are reading this file be aware that the original Cargo.toml
# will likely look very different (and much more reasonable).
# See Cargo.toml.orig for the original contents.
[package]
edition = "2021"
rust-version = "1.60.0"
name = "aho-corasick"
version = "1.1.2"
authors = ["Andrew Gallant <jamslam@gmail.com>"]
exclude = [
"/aho-corasick-debug",
"/benchmarks",
"/tmp",
]
autotests = false
description = "Fast multiple substring searching."
homepage = "https://github.com/BurntSushi/aho-corasick"
readme = "README.md"
keywords = [
"string",
"search",
"text",
"pattern",
"multi",
]
categories = ["text-processing"]
license = "Unlicense OR MIT"
repository = "https://github.com/BurntSushi/aho-corasick"
[package.metadata.docs.rs]
all-features = true
rustdoc-args = [
"--cfg",
"docsrs",
"--generate-link-to-definition",
]
[profile.bench]
debug = 2
[profile.release]
debug = 2
[lib]
name = "aho_corasick"
[dependencies.log]
version = "0.4.17"
optional = true
[dependencies.memchr]
version = "2.4.0"
optional = true
default-features = false
[dev-dependencies.doc-comment]
version = "0.3.3"
[features]
default = [
"std",
"perf-literal",
]
logging = ["dep:log"]
perf-literal = ["dep:memchr"]
std = ["memchr?/std"]

vendor/aho-corasick/DESIGN.md vendored Normal file

@ -0,0 +1,481 @@
This document describes the internal design of this crate, which is an object
lesson in what happens when you take a fairly simple old algorithm like
Aho-Corasick and make it fast and production ready.
The target audience of this document is Rust programmers that have some
familiarity with string searching, however, one does not need to know the
Aho-Corasick algorithm in order to read this (it is explained below). One
should, however, know what a trie is. (If you don't, go read its Wikipedia
article.)
The center-piece of this crate is an implementation of Aho-Corasick. On its
own, Aho-Corasick isn't that complicated. The complex pieces come from the
different variants of Aho-Corasick implemented in this crate. Specifically,
they are:
* Aho-Corasick as a noncontiguous NFA. States have their transitions
represented sparsely, and each state puts its transitions in its own separate
allocation. Hence the name "noncontiguous."
* Aho-Corasick as a contiguous NFA. This NFA uses a single allocation to
represent the transitions of all states. That is, transitions are laid out
contiguously in memory. Moreover, states near the starting state are
represented densely, such that finding the next state ID takes a constant
number of instructions.
* Aho-Corasick as a DFA. In this case, all states are represented densely in
a transition table that uses one allocation.
* Supporting "standard" match semantics, along with its overlapping variant,
in addition to leftmost-first and leftmost-longest semantics. The "standard"
semantics are typically what you see in a textbook description of
Aho-Corasick. However, Aho-Corasick is also useful as an optimization in
regex engines, which often use leftmost-first or leftmost-longest semantics.
Thus, it is useful to implement those semantics here. The "standard" and
"leftmost" search algorithms are subtly different, and also require slightly
different construction algorithms.
* Support for ASCII case insensitive matching.
* Support for accelerating searches when the patterns all start with a small
number of fixed bytes. Or alternatively, when the patterns all contain a
small number of rare bytes. (Searching for these bytes uses SIMD vectorized
code courtesy of `memchr`.)
* Transparent support for alternative SIMD vectorized search routines for
smaller number of literals, such as the Teddy algorithm. We called these
"packed" search routines because they use SIMD. They can often be an order of
magnitude faster than just Aho-Corasick, but don't scale as well.
* Support for searching streams. This can reuse most of the underlying code,
but does require careful buffering support.
* Support for anchored searches, which permit efficient "is prefix" checks for
a large number of patterns.
When you combine all of this together along with trying to make everything as
fast as possible, what you end up with is entirely too much code with too much
`unsafe`. Alas, I was not smart enough to figure out how to reduce it. Instead,
we will explain it.
# Basics
The fundamental problem this crate is trying to solve is to determine the
occurrences of possibly many patterns in a haystack. The naive way to solve
this is to look for a match for each pattern at each position in the haystack:
    for i in 0..haystack.len():
        for p in patterns.iter():
            if haystack[i..].starts_with(p.bytes()):
                return Match(p.id(), i, i + p.bytes().len())
Those four lines are effectively all this crate does. The problem with those
four lines is that they are very slow, especially when you're searching for a
large number of patterns.
While there are many different algorithms available to solve this, a popular
one is Aho-Corasick. It's a common solution because it's not too hard to
implement, scales quite well even when searching for thousands of patterns and
is generally pretty fast. Aho-Corasick does well here because, regardless of
the number of patterns you're searching for, it always visits each byte in the
haystack exactly once. This means, generally speaking, adding more patterns to
an Aho-Corasick automaton does not make it slower. (Strictly speaking, however,
this is not true, since a larger automaton will make less effective use of the
CPU's cache.)
Aho-Corasick can be succinctly described as a trie with state transitions
between some of the nodes that efficiently instruct the search algorithm to
try matching alternative keys in the trie. The trick is that these state
transitions are arranged such that each byte of input needs to be inspected
only once. These state transitions are typically called "failure transitions,"
because they instruct the searcher (the thing traversing the automaton while
reading from the haystack) what to do when a byte in the haystack does not
correspond to a valid transition in the current state of the trie.
More formally, a failure transition points to a state in the automaton that may
lead to a match whose prefix is a proper suffix of the path traversed through
the trie so far. (If no such proper suffix exists, then the failure transition
points back to the start state of the trie, effectively restarting the search.)
This is perhaps simpler to explain pictorially. For example, let's say we built
an Aho-Corasick automaton with the following patterns: 'abcd' and 'cef'. The
trie looks like this:
       a - S1 - b - S2 - c - S3 - d - S4*
      /
    S0 - c - S5 - e - S6 - f - S7*
where states marked with a `*` are match states (meaning, the search algorithm
should stop and report a match to the caller).
So given this trie, it should be somewhat straight-forward to see how it can
be used to determine whether any particular haystack *starts* with either
`abcd` or `cef`. It's easy to express this in code:
    fn has_prefix(trie: &Trie, haystack: &[u8]) -> bool {
        let mut state_id = trie.start();
        // If the empty pattern is in trie, then state_id is a match state.
        if trie.is_match(state_id) {
            return true;
        }
        for (i, &b) in haystack.iter().enumerate() {
            state_id = match trie.next_state(state_id, b) {
                Some(id) => id,
                // If there was no transition for this state and byte, then we know
                // the haystack does not start with one of the patterns in our trie.
                None => return false,
            };
            if trie.is_match(state_id) {
                return true;
            }
        }
        false
    }
And that's pretty much it. All we do is move through the trie starting with the
bytes at the beginning of the haystack. If we find ourselves in a position
where we can't move, or if we've looked through the entire haystack without
seeing a match state, then we know the haystack does not start with any of the
patterns in the trie.
The meat of the Aho-Corasick algorithm is in how we add failure transitions to
our trie to keep searching efficient. Specifically, it permits us to not only
check whether a haystack *starts* with any one of a number of patterns, but
rather, whether the haystack contains any of a number of patterns *anywhere* in
the haystack.
As mentioned before, failure transitions connect a proper suffix of the path
traversed through the trie before, with a path that leads to a match that has a
prefix corresponding to that proper suffix. So in our case, for patterns `abcd`
and `cef`, with a haystack `abcef`, we want to transition to state `S5` (from
the diagram above) from `S3` upon seeing that the byte following `c` is not
`d`. Namely, the proper suffix in this example is `c`, which is a prefix of
`cef`. So the modified diagram looks like this:
       a - S1 - b - S2 - c - S3 - d - S4*
      /                       /
     /       -----------------
    /       /
    S0 - c - S5 - e - S6 - f - S7*
One thing that isn't shown in this diagram is that *all* states have a failure
transition, but only `S3` has a *non-trivial* failure transition. That is, all
other states have a failure transition back to the start state. So if our
haystack was `abzabcd`, then the searcher would transition back to `S0` after
seeing `z`, which effectively restarts the search. (Because there is no pattern
in our trie that has a prefix of `bz` or `z`.)
The code for traversing this *automaton* or *finite state machine* (it is no
longer just a trie) is not that much different from the `has_prefix` code
above:
    fn contains(fsm: &FiniteStateMachine, haystack: &[u8]) -> bool {
        let mut state_id = fsm.start();
        // If the empty pattern is in fsm, then state_id is a match state.
        if fsm.is_match(state_id) {
            return true;
        }
        for (i, &b) in haystack.iter().enumerate() {
            // While the diagram above doesn't show this, we may wind up needing
            // to follow multiple failure transitions before we land on a state
            // in which we can advance. Therefore, when searching for the next
            // state, we need to loop until we don't see a failure transition.
            //
            // This loop terminates because the start state has no empty
            // transitions. Every transition from the start state either points to
            // another state, or loops back to the start state.
            loop {
                match fsm.next_state(state_id, b) {
                    Some(id) => {
                        state_id = id;
                        break;
                    }
                    // Unlike our code above, if there was no transition for this
                    // state, then we don't quit. Instead, we look for this state's
                    // failure transition and follow that instead.
                    None => {
                        state_id = fsm.next_fail_state(state_id);
                    }
                };
            }
            if fsm.is_match(state_id) {
                return true;
            }
        }
        false
    }
Other than the complication around traversing failure transitions, this code
is still roughly "traverse the automaton with bytes from the haystack, and quit
when a match is seen."
And that concludes our section on the basics. While we didn't go deep into how
the automaton is built (see `src/nfa/noncontiguous.rs`, which has detailed
comments about that), the basic structure of Aho-Corasick should be reasonably
clear.
# NFAs and DFAs
There are generally two types of finite automata: non-deterministic finite
automata (NFA) and deterministic finite automata (DFA). The difference between
them is, principally, that an NFA can be in multiple states at once. This is
typically accomplished by things called _epsilon_ transitions, where one could
move to a new state without consuming any bytes from the input. (The other
mechanism by which NFAs can be in more than one state is where the same byte in
a particular state transitions to multiple distinct states.) In contrast, a DFA
can only ever be in one state at a time. A DFA has no epsilon transitions, and
for any given state, a byte transitions to at most one other state.
By this formulation, the Aho-Corasick automaton described in the previous
section is an NFA. This is because failure transitions are, effectively,
epsilon transitions. That is, whenever the automaton is in state `S`, it is
actually in the set of states that are reachable by recursively following
failure transitions from `S` until you reach the start state. (This means
that, for example, the start state is always active since the start state is
reachable via failure transitions from any state in the automaton.)
NFAs have a lot of nice properties. They tend to be easier to construct, and
also tend to use less memory. However, their primary downside is that they are
typically slower to execute a search with. For example, the code above showing
how to search with an Aho-Corasick automaton needs to potentially iterate
through many failure transitions for every byte of input. While this is a
fairly small amount of overhead, this can add up, especially if the automaton
has a lot of overlapping patterns with a lot of failure transitions.
A DFA's search code, by contrast, looks like this:
    fn contains(dfa: &DFA, haystack: &[u8]) -> bool {
        let mut state_id = dfa.start();
        // If the empty pattern is in dfa, then state_id is a match state.
        if dfa.is_match(state_id) {
            return true;
        }
        for (i, &b) in haystack.iter().enumerate() {
            // An Aho-Corasick DFA *never* has a missing state that requires
            // failure transitions to be followed. One byte of input advances the
            // automaton by one state. Always.
            state_id = dfa.next_state(state_id, b);
            if dfa.is_match(state_id) {
                return true;
            }
        }
        false
    }
The search logic here is much simpler than for the NFA, and this tends to
translate into significant performance benefits as well, since there's a lot
less work being done for each byte in the haystack. How is this accomplished?
It's done by pre-following all failure transitions for all states for all bytes
in the alphabet, and then building a single state transition table. Building
this DFA can be much more costly than building the NFA, and use much more
memory, but the better performance can be worth it.
Users of this crate can actually choose between using one of two possible NFAs
(noncontiguous or contiguous) or a DFA. By default, a contiguous NFA is used,
in most circumstances, but if the number of patterns is small enough a DFA will
be used. A contiguous NFA is chosen because it uses orders of magnitude less
memory than a DFA, takes only a little longer to build than a noncontiguous
NFA and usually gets pretty close to the search speed of a DFA. (Callers can
override this automatic selection via the `AhoCorasickBuilder::kind`
configuration.)
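To make that trade-off concrete, here is a hedged sketch against the crate's public 1.x API (it assumes the builder's `kind` option and the `AhoCorasickKind` enum, which are not described in this document):
    use aho_corasick::{AhoCorasick, AhoCorasickKind};

    fn main() {
        let patterns = &["abcd", "cef"];
        // Default: the crate picks the automaton kind automatically.
        let auto = AhoCorasick::new(patterns).unwrap();
        // Explicitly request a DFA: more memory and build time, faster searches.
        let dfa = AhoCorasick::builder()
            .kind(Some(AhoCorasickKind::DFA))
            .build(patterns)
            .unwrap();
        // Both automatons report the same matches; only performance differs.
        assert_eq!(
            auto.find("xxabcdxx").map(|m| m.span()),
            dfa.find("xxabcdxx").map(|m| m.span()),
        );
    }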
# More DFA tricks
As described in the previous section, one of the downsides of using a DFA
is that it uses more memory and can take longer to build. One small way of
mitigating these concerns is to map the alphabet used by the automaton into
a smaller space. Typically, the alphabet of a DFA has 256 elements in it:
one element for each possible value that fits into a byte. However, in many
cases, one does not need the full alphabet. For example, if all patterns in an
Aho-Corasick automaton are ASCII letters, then this only uses up 52 distinct
bytes. As far as the automaton is concerned, the rest of the 204 bytes are
indistinguishable from one another: they will never discriminate between a
match and a non-match. Therefore, in cases like that, the alphabet can be shrunk
to just 53 elements. One for each ASCII letter, and then another to serve as a
placeholder for every other unused byte.
In practice, this library doesn't quite compute the optimal set of equivalence
classes, but it's close enough in most cases. The key idea is that this then
allows the transition table for the DFA to be potentially much smaller. The
downside of doing this, however, is that since the transition table is defined
in terms of this smaller alphabet space, every byte in the haystack must be
re-mapped to this smaller space. This requires an additional 256-byte table.
In practice, this can lead to a small search time hit, but it can be difficult
to measure. Moreover, it can sometimes lead to faster search times for bigger
automata, since it could be the difference between more parts of the automaton
staying in the CPU cache or not.
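A hedged sketch of the idea (illustrative only, not this crate's actual byte-class code): build a 256-entry map from bytes to equivalence classes, reserving one shared class for bytes that never appear in any pattern.
    // Sketch only: assumes at least one byte value never appears in a pattern
    // (as in the ASCII-letters example above), so the shared class 0 is real.
    fn byte_classes(patterns: &[&[u8]]) -> ([u8; 256], usize) {
        let mut seen = [false; 256];
        for p in patterns {
            for &b in *p {
                seen[b as usize] = true;
            }
        }
        let mut map = [0u8; 256];
        let mut count = 1usize; // class 0 = "appears in no pattern"
        for b in 0..256usize {
            if seen[b] {
                map[b] = count as u8;
                count += 1;
            }
        }
        (map, count)
    }
During a search, every haystack byte is first pushed through `map` before indexing the transition table, which is the extra 256-byte lookup mentioned above.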
One other trick for DFAs employed by this crate is the notion of premultiplying
state identifiers. Specifically, the normal way to compute the next transition
in a DFA is via the following (assuming that the transition table is laid out
sequentially in memory, in row-major order, where the rows are states):
    next_state_id = dfa.transitions[current_state_id * 256 + current_byte]
However, since the value `256` is a fixed constant, we can actually premultiply
the state identifiers in the table when we build the table initially. Then, the
next transition computation simply becomes:
    next_state_id = dfa.transitions[current_state_id + current_byte]
This doesn't seem like much, but when this is being executed for every byte of
input that you're searching, saving that extra multiplication instruction can
add up.
The same optimization works even when equivalence classes are enabled, as
described above. The only difference is that the premultiplication is by the
total number of equivalence classes instead of 256.
There isn't much downside to premultiplying state identifiers, other than it
imposes a smaller limit on the total number of states in the DFA. Namely, with
premultiplied state identifiers, you run out of room in your state identifier
representation more rapidly than if the identifiers are just state indices.
Both equivalence classes and premultiplication are always enabled. There is a
`AhoCorasickBuilder::byte_classes` configuration, but disabling this just makes
it so there are always 256 equivalence classes, i.e., every class corresponds
to precisely one byte. When it's disabled, the equivalence class map itself is
still used. The purpose of disabling it is when one is debugging the underlying
automaton. It can be easier to comprehend when it uses actual byte values for
its transitions instead of equivalence classes.
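Putting the last two tricks together, a hedged sketch of what a lookup might look like (hypothetical types, not this crate's internals):
    // Hypothetical flattened DFA: `transitions` stores *premultiplied* state
    // ids, so advancing is one class lookup, one add and one index.
    struct Dfa {
        classes: [u8; 256],    // byte -> equivalence class
        stride: u32,           // number of classes; used when building the table
        transitions: Vec<u32>, // row-major: one row of `stride` entries per state
    }

    impl Dfa {
        #[inline]
        fn next_state(&self, premultiplied_state: u32, haystack_byte: u8) -> u32 {
            let class = u32::from(self.classes[haystack_byte as usize]);
            // No `* self.stride` here: it was already folded into the state ids
            // when the table was built.
            self.transitions[(premultiplied_state + class) as usize]
        }
    }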
# Match semantics
One of the more interesting things about this implementation of Aho-Corasick
that (as far as this author knows) separates it from other implementations, is
that it natively supports leftmost-first and leftmost-longest match semantics.
Briefly, match semantics refer to the decision procedure by which searching
will disambiguate matches when there are multiple to choose from:
* **standard** match semantics emits matches as soon as they are detected by
the automaton. This is typically equivalent to the textbook non-overlapping
formulation of Aho-Corasick.
* **leftmost-first** match semantics means that 1) the next match is the match
starting at the leftmost position and 2) among multiple matches starting at
the same leftmost position, the match corresponding to the pattern provided
first by the caller is reported.
* **leftmost-longest** is like leftmost-first, except when there are multiple
matches starting at the same leftmost position, the pattern corresponding to
the longest match is returned.
(The crate API documentation discusses these differences, with examples, in
more depth on the `MatchKind` type.)
The reason why supporting these match semantics is important is because it
gives the user more control over the match procedure. For example,
leftmost-first permits users to implement match priority by simply putting the
higher priority patterns first. Leftmost-longest, on the other hand, permits
finding the longest possible match, which might be useful when trying to find
words matching a dictionary. Additionally, regex engines often want to use
Aho-Corasick as an optimization when searching for an alternation of literals.
In order to preserve correct match semantics, regex engines typically can't use
the standard textbook definition directly, since regex engines will implement
either leftmost-first (Perl-like) or leftmost-longest (POSIX) match semantics.
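For example, a caller chooses the semantics via `MatchKind` on the builder. The following is a hedged sketch against the public API (the `Samwise`/`Sam` pair is the classic example from the crate documentation; exact offsets follow from the definitions above):
    use aho_corasick::{AhoCorasick, MatchKind};

    fn main() {
        let patterns = &["Samwise", "Sam"];
        let haystack = "Samwise Gamgee";

        // Standard semantics: report a match as soon as one is seen, so the
        // shorter pattern "Sam" (index 1) wins at offset 0..3.
        let standard = AhoCorasick::new(patterns).unwrap();
        let m = standard.find(haystack).unwrap();
        assert_eq!((m.pattern().as_usize(), m.start(), m.end()), (1, 0, 3));

        // Leftmost-first: among matches starting at the same position, the
        // pattern listed first by the caller ("Samwise", index 0) wins.
        let leftmost = AhoCorasick::builder()
            .match_kind(MatchKind::LeftmostFirst)
            .build(patterns)
            .unwrap();
        let m = leftmost.find(haystack).unwrap();
        assert_eq!((m.pattern().as_usize(), m.start(), m.end()), (0, 0, 7));
    }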
Supporting leftmost semantics requires a couple key changes:
* Constructing the Aho-Corasick automaton changes a bit in both how the trie is
constructed and how failure transitions are found. Namely, only a subset
of the failure transitions are added. Specifically, only the failure
transitions that either do not occur after a match or do occur after a match
but preserve that match are kept. (More details on this can be found in
`src/nfa/noncontiguous.rs`.)
* The search algorithm changes slightly. Since we are looking for the leftmost
match, we cannot quit as soon as a match is detected. Instead, after a match
is detected, we must keep searching until either the end of the input or
until a dead state is seen. (Dead states are not used for standard match
semantics. Dead states mean that searching should stop after a match has been
found.)
Most other implementations of Aho-Corasick do support leftmost match semantics,
but they do it with more overhead at search time, or even worse, with a queue
of matches and sophisticated hijinks to disambiguate the matches. While our
construction algorithm becomes a bit more complicated, the correct match
semantics fall out from the structure of the automaton itself.
# Overlapping matches
One of the nice properties of an Aho-Corasick automaton is that it can report
all possible matches, even when they overlap with one another. In this mode,
the match semantics don't matter, since all possible matches are reported.
Overlapping searches work just like regular searches, except the state
identifier at which the previous search left off is carried over to the next
search, so that it can pick up where it left off. If there are additional
matches at that state, then they are reported before resuming the search.
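For example, using the crate's public `find_overlapping_iter` API (the pattern
set and haystack are chosen only for illustration):

```rust
use aho_corasick::AhoCorasick;

fn main() {
    // The default (standard) match semantics are required for overlapping
    // searches.
    let ac = AhoCorasick::new(&["abc", "b", "bc"]).unwrap();
    // All three patterns occur in "abc", even though they overlap.
    assert_eq!(3, ac.find_overlapping_iter("abc").count());
}
```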
Enabling leftmost-first or leftmost-longest match semantics causes the
automaton to use a subset of all failure transitions, which means that
overlapping searches cannot be used. Therefore, if leftmost match semantics are
used, attempting to do an overlapping search will return an error (or panic
when using the infallible APIs). Thus, to get overlapping searches, the caller
must use the default standard match semantics. This behavior was chosen because
there are only two alternatives, which were deemed worse:
* Compile two automatons internally, one for standard semantics and one for
the semantics requested by the caller (if not standard).
* Create a new type, distinct from the `AhoCorasick` type, which has different
capabilities based on the configuration options.
The first is untenable because of the amount of memory used by the automaton.
The second increases the complexity of the API too much by adding too many
types that do similar things. It is conceptually much simpler to keep all
searching isolated to a single type.
# Stream searching
Since Aho-Corasick is an automaton, it is possible to do a partial search on
one piece of the haystack and then resume that search on subsequent pieces.
This is useful when the haystack you're trying to search is not stored
contiguously in memory, or when you don't want to read the entire haystack
into memory at once.
Currently, only standard semantics are supported for stream searching. This is
some of the more complicated code in this crate, and is something I would very
much like to improve. In particular, it currently has the restriction that it
must buffer at least enough of the haystack in memory in order to fit the
longest possible match. The difficulty in getting stream searching right is
that the implementation choices (such as the buffer size) often impact what the
API looks like and what it's allowed to do.
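As a sketch of how this looks from the public API (assuming, as the std-only
stream routines document, that the stream iterator yields
`std::io::Result<Match>` items), one can count matches in any `std::io::Read`
without reading it fully into memory:

```rust
use std::io;

use aho_corasick::AhoCorasick;

fn count_matches<R: io::Read>(rdr: R) -> io::Result<u64> {
    let ac = AhoCorasick::new(&["Sherlock", "Watson"]).unwrap();
    let mut count = 0;
    // Only the default (standard) semantics support stream searching, so
    // `try_stream_find_iter` succeeds here.
    for result in ac.try_stream_find_iter(rdr).expect("standard semantics") {
        let _mat = result?;
        count += 1;
    }
    Ok(count)
}

fn main() -> io::Result<()> {
    // In real code this might be a `std::fs::File` or a network stream.
    let haystack = "Sherlock consulted Watson about the case.";
    assert_eq!(2, count_matches(haystack.as_bytes())?);
    Ok(())
}
```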
# Prefilters
In some cases, Aho-Corasick is not the fastest way to find occurrences of
multiple patterns. Sometimes, the search can be accelerated using highly
optimized SIMD routines. For example, consider searching the following
patterns:
Sherlock
Moriarty
Watson
It is plausible that it would be much faster to quickly look for occurrences of
the leading bytes, `S`, `M` or `W`, before trying to start searching via the
automaton. Indeed, this is exactly what this crate will do.
When there are more than three distinct starting bytes, this crate will
instead look for three distinct bytes occurring at any position in the
patterns, preferring bytes that are heuristically determined to be rarer than
others. For example:
Abuzz
Sanchez
Vasquez
Topaz
Waltz
Here, we have more than 3 distinct starting bytes, but all of the patterns
contain `z`, which is typically a rare byte. In this case, the prefilter will
scan for `z`, back up a bit, and then execute the Aho-Corasick automaton.
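As a toy illustration of the rare-byte idea (this is not the crate's prefilter
machinery; a real prefilter would use a vectorized byte search such as
`memchr` rather than a scalar loop):

```rust
/// Yield candidate starting offsets by scanning for a byte that every pattern
/// contains, backing up far enough that a match containing that byte could
/// have started at the returned offset.
fn rare_byte_candidates(
    haystack: &[u8],
    rare: u8,
    max_pattern_len: usize,
) -> impl Iterator<Item = usize> + '_ {
    haystack.iter().enumerate().filter_map(move |(i, &b)| {
        if b == rare {
            Some(i.saturating_sub(max_pattern_len.saturating_sub(1)))
        } else {
            None
        }
    })
}

fn main() {
    // With the pattern set above, `z` is the rare byte and the longest
    // patterns ("Sanchez", "Vasquez") are 7 bytes long.
    let haystack = b"the topaz waltz";
    let starts: Vec<usize> = rare_byte_candidates(haystack, b'z', 7).collect();
    // These are the offsets at which the automaton would resume scanning.
    assert_eq!(starts, vec![2, 8]);
}
```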
If all of that fails, then a packed multiple substring algorithm will be
attempted. Currently, the only algorithm available for this is Teddy, but more
may be added in the future. Teddy is unlike the above prefilters in that it
confirms its own matches, so when Teddy is active, it might not be necessary
for Aho-Corasick to run at all. However, the current Teddy implementation
only works on `x86_64` when SSSE3 or AVX2 is available, or on `aarch64`
(using NEON), and moreover, only works _well_ when there is a small number
of patterns (say, fewer than 100). Teddy also requires the haystack to have a
certain minimum length (more than 16-34 bytes). When the haystack is shorter
than that, Rabin-Karp is used instead. (See `src/packed/rabinkarp.rs`.)
There is a more thorough description of Teddy at
[`src/packed/teddy/README.md`](src/packed/teddy/README.md).

21
vendor/aho-corasick/LICENSE-MIT vendored Normal file
View File

@ -0,0 +1,21 @@
The MIT License (MIT)
Copyright (c) 2015 Andrew Gallant
Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:
The above copyright notice and this permission notice shall be included in
all copies or substantial portions of the Software.
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
THE SOFTWARE.

174
vendor/aho-corasick/README.md vendored Normal file
View File

@ -0,0 +1,174 @@
aho-corasick
============
A library for finding occurrences of many patterns at once with SIMD
acceleration in some cases. This library provides multiple pattern
search principally through an implementation of the
[Aho-Corasick algorithm](https://en.wikipedia.org/wiki/Aho%E2%80%93Corasick_algorithm),
which builds a finite state machine for executing searches in linear time.
Features include case insensitive matching, overlapping matches, fast searching
via SIMD and optional full DFA construction and search & replace in streams.
[![Build status](https://github.com/BurntSushi/aho-corasick/workflows/ci/badge.svg)](https://github.com/BurntSushi/aho-corasick/actions)
[![crates.io](https://img.shields.io/crates/v/aho-corasick.svg)](https://crates.io/crates/aho-corasick)
Dual-licensed under MIT or the [UNLICENSE](https://unlicense.org/).
### Documentation
https://docs.rs/aho-corasick
### Usage
Run `cargo add aho-corasick` to automatically add this crate as a dependency
in your `Cargo.toml` file.
### Example: basic searching
This example shows how to search for occurrences of multiple patterns
simultaneously. Each match includes the pattern that matched along with the
byte offsets of the match.
```rust
use aho_corasick::{AhoCorasick, PatternID};
let patterns = &["apple", "maple", "Snapple"];
let haystack = "Nobody likes maple in their apple flavored Snapple.";
let ac = AhoCorasick::new(patterns).unwrap();
let mut matches = vec![];
for mat in ac.find_iter(haystack) {
matches.push((mat.pattern(), mat.start(), mat.end()));
}
assert_eq!(matches, vec![
(PatternID::must(1), 13, 18),
(PatternID::must(0), 28, 33),
(PatternID::must(2), 43, 50),
]);
```
### Example: ASCII case insensitivity
This is like the previous example, but matches `Snapple` case insensitively
using `AhoCorasickBuilder`:
```rust
use aho_corasick::{AhoCorasick, PatternID};
let patterns = &["apple", "maple", "snapple"];
let haystack = "Nobody likes maple in their apple flavored Snapple.";
let ac = AhoCorasick::builder()
.ascii_case_insensitive(true)
.build(patterns)
.unwrap();
let mut matches = vec![];
for mat in ac.find_iter(haystack) {
matches.push((mat.pattern(), mat.start(), mat.end()));
}
assert_eq!(matches, vec![
(PatternID::must(1), 13, 18),
(PatternID::must(0), 28, 33),
(PatternID::must(2), 43, 50),
]);
```
### Example: replacing matches in a stream
This example shows how to execute a search and replace on a stream without
loading the entire stream into memory first.
```rust,ignore
use aho_corasick::AhoCorasick;
let patterns = &["fox", "brown", "quick"];
let replace_with = &["sloth", "grey", "slow"];
// In a real example, these might be `std::fs::File`s instead. All you need to
// do is supply a pair of `std::io::Read` and `std::io::Write` implementations.
let rdr = "The quick brown fox.";
let mut wtr = vec![];
let ac = AhoCorasick::new(patterns).unwrap();
ac.stream_replace_all(rdr.as_bytes(), &mut wtr, replace_with)
.expect("stream_replace_all failed");
assert_eq!(b"The slow grey sloth.".to_vec(), wtr);
```
### Example: finding the leftmost first match
In the textbook description of Aho-Corasick, its formulation is typically
structured such that it reports all possible matches, even when they overlap
with one another. In many cases, overlapping matches may not be desired, such as
the case of finding all successive non-overlapping matches like you might with
a standard regular expression.
Unfortunately the "obvious" way to modify the Aho-Corasick algorithm to do
this doesn't always work in the expected way, since it will report matches as
soon as they are seen. For example, consider matching the regex `Samwise|Sam`
against the text `Samwise`. Most regex engines (that are Perl-like, or
non-POSIX) will report `Samwise` as a match, but the standard Aho-Corasick
algorithm modified for reporting non-overlapping matches will report `Sam`.
A novel contribution of this library is the ability to change the match
semantics of Aho-Corasick (without additional search time overhead) such that
`Samwise` is reported instead. For example, here's the standard approach:
```rust
use aho_corasick::AhoCorasick;
let patterns = &["Samwise", "Sam"];
let haystack = "Samwise";
let ac = AhoCorasick::new(patterns).unwrap();
let mat = ac.find(haystack).expect("should have a match");
assert_eq!("Sam", &haystack[mat.start()..mat.end()]);
```
And now here's the leftmost-first version, which matches how a Perl-like
regex will work:
```rust
use aho_corasick::{AhoCorasick, MatchKind};
let patterns = &["Samwise", "Sam"];
let haystack = "Samwise";
let ac = AhoCorasick::builder()
.match_kind(MatchKind::LeftmostFirst)
.build(patterns)
.unwrap();
let mat = ac.find(haystack).expect("should have a match");
assert_eq!("Samwise", &haystack[mat.start()..mat.end()]);
```
In addition to leftmost-first semantics, this library also supports
leftmost-longest semantics, which match the POSIX behavior of a regular
expression alternation. See `MatchKind` in the docs for more details.
### Minimum Rust version policy
This crate's minimum supported `rustc` version is `1.60.0`.
The current policy is that the minimum Rust version required to use this crate
can be increased in minor version updates. For example, if `crate 1.0` requires
Rust 1.20.0, then `crate 1.0.z` for all values of `z` will also require Rust
1.20.0 or newer. However, `crate 1.y` for `y > 0` may require a newer minimum
version of Rust.
In general, this crate will be conservative with respect to the minimum
supported version of Rust.
### FFI bindings
* [G-Research/ahocorasick_rs](https://github.com/G-Research/ahocorasick_rs/)
is a Python wrapper for this library.
* [tmikus/ahocorasick_rs](https://github.com/tmikus/ahocorasick_rs) is a Go
wrapper for this library.

24
vendor/aho-corasick/UNLICENSE vendored Normal file
View File

@ -0,0 +1,24 @@
This is free and unencumbered software released into the public domain.
Anyone is free to copy, modify, publish, use, compile, sell, or
distribute this software, either in source code form or as a compiled
binary, for any purpose, commercial or non-commercial, and by any
means.
In jurisdictions that recognize copyright laws, the author or authors
of this software dedicate any and all copyright interest in the
software to the public domain. We make this dedication for the benefit
of the public at large and to the detriment of our heirs and
successors. We intend this dedication to be an overt act of
relinquishment in perpetuity of all present and future rights to this
software under copyright law.
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
IN NO EVENT SHALL THE AUTHORS BE LIABLE FOR ANY CLAIM, DAMAGES OR
OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
OTHER DEALINGS IN THE SOFTWARE.
For more information, please refer to <http://unlicense.org/>

2
vendor/aho-corasick/rustfmt.toml vendored Normal file
View File

@ -0,0 +1,2 @@
max_width = 79
use_small_heuristics = "max"

2789
vendor/aho-corasick/src/ahocorasick.rs vendored Normal file

File diff suppressed because it is too large Load Diff

1608
vendor/aho-corasick/src/automaton.rs vendored Normal file

File diff suppressed because it is too large Load Diff

835
vendor/aho-corasick/src/dfa.rs vendored Normal file
View File

@ -0,0 +1,835 @@
/*!
Provides direct access to a DFA implementation of Aho-Corasick.
This is a low-level API that generally only needs to be used in niche
circumstances. When possible, prefer using [`AhoCorasick`](crate::AhoCorasick)
instead of a DFA directly. Using a `DFA` directly is typically only necessary
when one needs access to the [`Automaton`] trait implementation.
*/
use alloc::{vec, vec::Vec};
use crate::{
automaton::Automaton,
nfa::noncontiguous,
util::{
alphabet::ByteClasses,
error::{BuildError, MatchError},
int::{Usize, U32},
prefilter::Prefilter,
primitives::{IteratorIndexExt, PatternID, SmallIndex, StateID},
search::{Anchored, MatchKind, StartKind},
special::Special,
},
};
/// A DFA implementation of Aho-Corasick.
///
/// When possible, prefer using [`AhoCorasick`](crate::AhoCorasick) instead of
/// this type directly. Using a `DFA` directly is typically only necessary when
/// one needs access to the [`Automaton`] trait implementation.
///
/// This DFA can only be built by first constructing a [`noncontiguous::NFA`].
/// Both [`DFA::new`] and [`Builder::build`] do this for you automatically, but
/// [`Builder::build_from_noncontiguous`] permits doing it explicitly.
///
/// A DFA provides the best possible search performance (in this crate) via two
/// mechanisms:
///
/// * All states use a dense representation for their transitions.
/// * All failure transitions are pre-computed such that they are never
/// explicitly handled at search time.
///
/// These two facts combined mean that every state transition is performed
/// using a constant number of instructions. However, this comes at
/// great cost. The memory usage of a DFA can be quite exorbitant.
/// It is potentially multiple orders of magnitude greater than a
/// [`contiguous::NFA`](crate::nfa::contiguous::NFA) for example. In exchange,
/// a DFA will typically have better search speed than a `contiguous::NFA`, but
/// not by orders of magnitude.
///
/// Unless you have a small number of patterns or memory usage is not a concern
/// and search performance is critical, a DFA is usually not the best choice.
///
/// Moreover, unlike the NFAs in this crate, it is costly for a DFA to
/// support both anchored and unanchored search configurations. Namely,
/// since failure transitions are pre-computed, supporting both anchored
/// and unanchored searches requires a duplication of the transition table,
/// making the memory usage of such a DFA ever bigger. (The NFAs in this crate
/// unconditionally support both anchored and unanchored searches because there
/// is essentially no added cost for doing so.) It is for this reason that
/// a DFA's support for anchored and unanchored searches can be configured
/// via [`Builder::start_kind`]. By default, a DFA only supports unanchored
/// searches.
///
/// # Example
///
/// This example shows how to build a `DFA` directly and use it to execute
/// [`Automaton::try_find`]:
///
/// ```
/// use aho_corasick::{
/// automaton::Automaton,
/// dfa::DFA,
/// Input, Match,
/// };
///
/// let patterns = &["b", "abc", "abcd"];
/// let haystack = "abcd";
///
/// let nfa = DFA::new(patterns).unwrap();
/// assert_eq!(
/// Some(Match::must(0, 1..2)),
/// nfa.try_find(&Input::new(haystack))?,
/// );
/// # Ok::<(), Box<dyn std::error::Error>>(())
/// ```
///
/// It is also possible to implement your own version of `try_find`. See the
/// [`Automaton`] documentation for an example.
#[derive(Clone)]
pub struct DFA {
/// The DFA transition table. IDs in this table are pre-multiplied. So
/// instead of the IDs being 0, 1, 2, 3, ..., they are 0*stride, 1*stride,
/// 2*stride, 3*stride, ...
trans: Vec<StateID>,
/// The matches for every match state in this DFA. This is first indexed by
/// state index (so that's `sid >> stride2`) and then by order in which the
/// matches are meant to occur.
matches: Vec<Vec<PatternID>>,
/// The amount of heap memory used, in bytes, by the inner Vecs of
/// 'matches'.
matches_memory_usage: usize,
/// The length of each pattern. This is used to compute the start offset
/// of a match.
pattern_lens: Vec<SmallIndex>,
/// A prefilter for accelerating searches, if one exists.
prefilter: Option<Prefilter>,
/// The match semantics built into this DFA.
match_kind: MatchKind,
/// The total number of states in this DFA.
state_len: usize,
/// The alphabet size, or total number of equivalence classes, for this
/// DFA. Note that the actual number of transitions in each state is
/// stride=2^stride2, where stride is the smallest power of 2 greater than
/// or equal to alphabet_len. We do things this way so that we can use
/// bitshifting to go from a state ID to an index into 'matches'.
alphabet_len: usize,
/// The exponent with a base 2, such that stride=2^stride2. Given a state
/// index 'i', its state identifier is 'i << stride2'. Given a state
/// identifier 'sid', its state index is 'sid >> stride2'.
stride2: usize,
/// The equivalence classes for this DFA. All transitions are defined on
/// equivalence classes and not on the 256 distinct byte values.
byte_classes: ByteClasses,
/// The length of the shortest pattern in this automaton.
min_pattern_len: usize,
/// The length of the longest pattern in this automaton.
max_pattern_len: usize,
/// The information required to deduce which states are "special" in this
/// DFA.
special: Special,
}
impl DFA {
/// Create a new Aho-Corasick DFA using the default configuration.
///
/// Use a [`Builder`] if you want to change the configuration.
pub fn new<I, P>(patterns: I) -> Result<DFA, BuildError>
where
I: IntoIterator<Item = P>,
P: AsRef<[u8]>,
{
DFA::builder().build(patterns)
}
/// A convenience method for returning a new Aho-Corasick DFA builder.
///
/// This usually permits one to just import the `DFA` type.
pub fn builder() -> Builder {
Builder::new()
}
}
impl DFA {
/// A sentinel state ID indicating that a search should stop once it has
/// entered this state. When a search stops, it returns a match if one has
/// been found, otherwise no match. A DFA always has an actual dead state
/// at this ID.
///
/// N.B. DFAs, unlike NFAs, do not have any notion of a FAIL state.
/// Namely, the whole point of a DFA is that the FAIL state is completely
/// compiled away. That is, DFA construction involves pre-computing the
/// failure transitions everywhere, such that failure transitions are no
/// longer used at search time. This, combined with its uniformly dense
/// representation, are the two most important factors in why it's faster
/// than the NFAs in this crate.
const DEAD: StateID = StateID::new_unchecked(0);
/// Adds the given pattern IDs as matches to the given state and also
/// records the added memory usage.
fn set_matches(
&mut self,
sid: StateID,
pids: impl Iterator<Item = PatternID>,
) {
let index = (sid.as_usize() >> self.stride2).checked_sub(2).unwrap();
let mut at_least_one = false;
for pid in pids {
self.matches[index].push(pid);
self.matches_memory_usage += PatternID::SIZE;
at_least_one = true;
}
assert!(at_least_one, "match state must have non-empty pids");
}
}
// SAFETY: 'start_state' always returns a valid state ID, 'next_state' always
// returns a valid state ID given a valid state ID. We otherwise claim that
// all other methods are correct as well.
unsafe impl Automaton for DFA {
#[inline(always)]
fn start_state(&self, anchored: Anchored) -> Result<StateID, MatchError> {
// Either of the start state IDs can be DEAD, in which case, support
// for that type of search is not provided by this DFA. Which start
// state IDs are inactive depends on the 'StartKind' configuration at
// DFA construction time.
match anchored {
Anchored::No => {
let start = self.special.start_unanchored_id;
if start == DFA::DEAD {
Err(MatchError::invalid_input_unanchored())
} else {
Ok(start)
}
}
Anchored::Yes => {
let start = self.special.start_anchored_id;
if start == DFA::DEAD {
Err(MatchError::invalid_input_anchored())
} else {
Ok(start)
}
}
}
}
#[inline(always)]
fn next_state(
&self,
_anchored: Anchored,
sid: StateID,
byte: u8,
) -> StateID {
let class = self.byte_classes.get(byte);
self.trans[(sid.as_u32() + u32::from(class)).as_usize()]
}
#[inline(always)]
fn is_special(&self, sid: StateID) -> bool {
sid <= self.special.max_special_id
}
#[inline(always)]
fn is_dead(&self, sid: StateID) -> bool {
sid == DFA::DEAD
}
#[inline(always)]
fn is_match(&self, sid: StateID) -> bool {
!self.is_dead(sid) && sid <= self.special.max_match_id
}
#[inline(always)]
fn is_start(&self, sid: StateID) -> bool {
sid == self.special.start_unanchored_id
|| sid == self.special.start_anchored_id
}
#[inline(always)]
fn match_kind(&self) -> MatchKind {
self.match_kind
}
#[inline(always)]
fn patterns_len(&self) -> usize {
self.pattern_lens.len()
}
#[inline(always)]
fn pattern_len(&self, pid: PatternID) -> usize {
self.pattern_lens[pid].as_usize()
}
#[inline(always)]
fn min_pattern_len(&self) -> usize {
self.min_pattern_len
}
#[inline(always)]
fn max_pattern_len(&self) -> usize {
self.max_pattern_len
}
#[inline(always)]
fn match_len(&self, sid: StateID) -> usize {
debug_assert!(self.is_match(sid));
let offset = (sid.as_usize() >> self.stride2) - 2;
self.matches[offset].len()
}
#[inline(always)]
fn match_pattern(&self, sid: StateID, index: usize) -> PatternID {
debug_assert!(self.is_match(sid));
let offset = (sid.as_usize() >> self.stride2) - 2;
self.matches[offset][index]
}
#[inline(always)]
fn memory_usage(&self) -> usize {
use core::mem::size_of;
(self.trans.len() * size_of::<u32>())
+ (self.matches.len() * size_of::<Vec<PatternID>>())
+ self.matches_memory_usage
+ (self.pattern_lens.len() * size_of::<SmallIndex>())
+ self.prefilter.as_ref().map_or(0, |p| p.memory_usage())
}
#[inline(always)]
fn prefilter(&self) -> Option<&Prefilter> {
self.prefilter.as_ref()
}
}
impl core::fmt::Debug for DFA {
fn fmt(&self, f: &mut core::fmt::Formatter) -> core::fmt::Result {
use crate::{
automaton::{fmt_state_indicator, sparse_transitions},
util::debug::DebugByte,
};
writeln!(f, "dfa::DFA(")?;
for index in 0..self.state_len {
let sid = StateID::new_unchecked(index << self.stride2);
// While we do currently include the FAIL state in the transition
// table (to simplify construction), it is never actually used. It
// poses problems with the code below because it gets treated as
// a match state incidentally when it is, of course, not. So we
// special case it. The fail state is always the first state after
// the dead state.
//
// If the construction is changed to remove the fail state (it
// probably should be), then this special case should be updated.
if index == 1 {
writeln!(f, "F {:06}:", sid.as_usize())?;
continue;
}
fmt_state_indicator(f, self, sid)?;
write!(f, "{:06}: ", sid.as_usize())?;
let it = (0..self.byte_classes.alphabet_len()).map(|class| {
(class.as_u8(), self.trans[sid.as_usize() + class])
});
for (i, (start, end, next)) in sparse_transitions(it).enumerate() {
if i > 0 {
write!(f, ", ")?;
}
if start == end {
write!(
f,
"{:?} => {:?}",
DebugByte(start),
next.as_usize()
)?;
} else {
write!(
f,
"{:?}-{:?} => {:?}",
DebugByte(start),
DebugByte(end),
next.as_usize()
)?;
}
}
write!(f, "\n")?;
if self.is_match(sid) {
write!(f, " matches: ")?;
for i in 0..self.match_len(sid) {
if i > 0 {
write!(f, ", ")?;
}
let pid = self.match_pattern(sid, i);
write!(f, "{}", pid.as_usize())?;
}
write!(f, "\n")?;
}
}
writeln!(f, "match kind: {:?}", self.match_kind)?;
writeln!(f, "prefilter: {:?}", self.prefilter.is_some())?;
writeln!(f, "state length: {:?}", self.state_len)?;
writeln!(f, "pattern length: {:?}", self.patterns_len())?;
writeln!(f, "shortest pattern length: {:?}", self.min_pattern_len)?;
writeln!(f, "longest pattern length: {:?}", self.max_pattern_len)?;
writeln!(f, "alphabet length: {:?}", self.alphabet_len)?;
writeln!(f, "stride: {:?}", 1 << self.stride2)?;
writeln!(f, "byte classes: {:?}", self.byte_classes)?;
writeln!(f, "memory usage: {:?}", self.memory_usage())?;
writeln!(f, ")")?;
Ok(())
}
}
/// A builder for configuring an Aho-Corasick DFA.
///
/// This builder has a subset of the options available to a
/// [`AhoCorasickBuilder`](crate::AhoCorasickBuilder). Of the shared options,
/// their behavior is identical.
#[derive(Clone, Debug)]
pub struct Builder {
noncontiguous: noncontiguous::Builder,
start_kind: StartKind,
byte_classes: bool,
}
impl Default for Builder {
fn default() -> Builder {
Builder {
noncontiguous: noncontiguous::Builder::new(),
start_kind: StartKind::Unanchored,
byte_classes: true,
}
}
}
impl Builder {
/// Create a new builder for configuring an Aho-Corasick DFA.
pub fn new() -> Builder {
Builder::default()
}
/// Build an Aho-Corasick DFA from the given iterator of patterns.
///
/// A builder may be reused to create more DFAs.
pub fn build<I, P>(&self, patterns: I) -> Result<DFA, BuildError>
where
I: IntoIterator<Item = P>,
P: AsRef<[u8]>,
{
let nnfa = self.noncontiguous.build(patterns)?;
self.build_from_noncontiguous(&nnfa)
}
/// Build an Aho-Corasick DFA from the given noncontiguous NFA.
///
/// Note that when this method is used, only the `start_kind` and
/// `byte_classes` settings on this builder are respected. The other
/// settings only apply to the initial construction of the Aho-Corasick
/// automaton. Since using this method requires that initial construction
/// has already completed, all settings impacting only initial construction
/// are no longer relevant.
pub fn build_from_noncontiguous(
&self,
nnfa: &noncontiguous::NFA,
) -> Result<DFA, BuildError> {
debug!("building DFA");
let byte_classes = if self.byte_classes {
nnfa.byte_classes().clone()
} else {
ByteClasses::singletons()
};
let state_len = match self.start_kind {
StartKind::Unanchored | StartKind::Anchored => nnfa.states().len(),
StartKind::Both => {
// These unwraps are OK because we know that the number of
// NFA states is < StateID::LIMIT which is in turn less than
// i32::MAX. Thus, there is always room to multiply by 2.
// Finally, the number of states is always at least 4 in the
// NFA (DEAD, FAIL, START-UNANCHORED, START-ANCHORED), so the
// subtraction of 4 is okay.
//
// Note that we subtract 4 because the "anchored" part of
// the DFA duplicates the unanchored part (without failure
// transitions), but reuses the DEAD, FAIL and START states.
nnfa.states()
.len()
.checked_mul(2)
.unwrap()
.checked_sub(4)
.unwrap()
}
};
let trans_len =
match state_len.checked_shl(byte_classes.stride2().as_u32()) {
Some(trans_len) => trans_len,
None => {
return Err(BuildError::state_id_overflow(
StateID::MAX.as_u64(),
usize::MAX.as_u64(),
))
}
};
StateID::new(trans_len.checked_sub(byte_classes.stride()).unwrap())
.map_err(|e| {
BuildError::state_id_overflow(
StateID::MAX.as_u64(),
e.attempted(),
)
})?;
let num_match_states = match self.start_kind {
StartKind::Unanchored | StartKind::Anchored => {
nnfa.special().max_match_id.as_usize().checked_sub(1).unwrap()
}
StartKind::Both => nnfa
.special()
.max_match_id
.as_usize()
.checked_sub(1)
.unwrap()
.checked_mul(2)
.unwrap(),
};
let mut dfa = DFA {
trans: vec![DFA::DEAD; trans_len],
matches: vec![vec![]; num_match_states],
matches_memory_usage: 0,
pattern_lens: nnfa.pattern_lens_raw().to_vec(),
prefilter: nnfa.prefilter().map(|p| p.clone()),
match_kind: nnfa.match_kind(),
state_len,
alphabet_len: byte_classes.alphabet_len(),
stride2: byte_classes.stride2(),
byte_classes,
min_pattern_len: nnfa.min_pattern_len(),
max_pattern_len: nnfa.max_pattern_len(),
// The special state IDs are set later.
special: Special::zero(),
};
match self.start_kind {
StartKind::Both => {
self.finish_build_both_starts(nnfa, &mut dfa);
}
StartKind::Unanchored => {
self.finish_build_one_start(Anchored::No, nnfa, &mut dfa);
}
StartKind::Anchored => {
self.finish_build_one_start(Anchored::Yes, nnfa, &mut dfa)
}
}
debug!(
"DFA built, <states: {:?}, size: {:?}, \
alphabet len: {:?}, stride: {:?}>",
dfa.state_len,
dfa.memory_usage(),
dfa.byte_classes.alphabet_len(),
dfa.byte_classes.stride(),
);
// The vectors can grow ~twice as big during construction because a
// Vec amortizes growth. But here, let's shrink things back down to
// what we actually need since we're never going to add more to it.
dfa.trans.shrink_to_fit();
dfa.pattern_lens.shrink_to_fit();
dfa.matches.shrink_to_fit();
// TODO: We might also want to shrink each Vec inside of `dfa.matches`,
// or even better, convert it to one contiguous allocation. But I think
// I went with nested allocs for good reason (can't remember), so this
// may be tricky to do. I decided not to shrink them here because it
// might require a fair bit of work to do. It's unclear whether it's
// worth it.
Ok(dfa)
}
/// Finishes building a DFA for either unanchored or anchored searches,
/// but NOT both.
fn finish_build_one_start(
&self,
anchored: Anchored,
nnfa: &noncontiguous::NFA,
dfa: &mut DFA,
) {
// This function always succeeds because we check above that all of the
// states in the NFA can be mapped to DFA state IDs.
let stride2 = dfa.stride2;
let old2new = |oldsid: StateID| {
StateID::new_unchecked(oldsid.as_usize() << stride2)
};
for (oldsid, state) in nnfa.states().iter().with_state_ids() {
let newsid = old2new(oldsid);
if state.is_match() {
dfa.set_matches(newsid, nnfa.iter_matches(oldsid));
}
sparse_iter(
nnfa,
oldsid,
&dfa.byte_classes,
|byte, class, mut oldnextsid| {
if oldnextsid == noncontiguous::NFA::FAIL {
if anchored.is_anchored() {
oldnextsid = noncontiguous::NFA::DEAD;
} else if state.fail() == noncontiguous::NFA::DEAD {
// This is a special case that avoids following
// DEAD transitions in a non-contiguous NFA.
// Following these transitions is pretty slow
// because the non-contiguous NFA will always use
// a sparse representation for it (because the
// DEAD state is usually treated as a sentinel).
// The *vast* majority of failure states are DEAD
// states, so this winds up being pretty slow if
// we go through the non-contiguous NFA state
// transition logic. Instead, just do it ourselves.
oldnextsid = noncontiguous::NFA::DEAD;
} else {
oldnextsid = nnfa.next_state(
Anchored::No,
state.fail(),
byte,
);
}
}
dfa.trans[newsid.as_usize() + usize::from(class)] =
old2new(oldnextsid);
},
);
}
// Now that we've remapped all the IDs in our states, all that's left
// is remapping the special state IDs.
let old = nnfa.special();
let new = &mut dfa.special;
new.max_special_id = old2new(old.max_special_id);
new.max_match_id = old2new(old.max_match_id);
if anchored.is_anchored() {
new.start_unanchored_id = DFA::DEAD;
new.start_anchored_id = old2new(old.start_anchored_id);
} else {
new.start_unanchored_id = old2new(old.start_unanchored_id);
new.start_anchored_id = DFA::DEAD;
}
}
/// Finishes building a DFA that supports BOTH unanchored and anchored
/// searches. It works by interleaving unanchored states with anchored
/// states in the same transition table. This way, we avoid needing to
/// re-shuffle states afterward to ensure that our states still look like
/// DEAD, MATCH, ..., START-UNANCHORED, START-ANCHORED, NON-MATCH, ...
///
/// Honestly this is pretty inscrutable... Simplifications are most
/// welcome.
fn finish_build_both_starts(
&self,
nnfa: &noncontiguous::NFA,
dfa: &mut DFA,
) {
let stride2 = dfa.stride2;
let stride = 1 << stride2;
let mut remap_unanchored = vec![DFA::DEAD; nnfa.states().len()];
let mut remap_anchored = vec![DFA::DEAD; nnfa.states().len()];
let mut is_anchored = vec![false; dfa.state_len];
let mut newsid = DFA::DEAD;
let next_dfa_id =
|sid: StateID| StateID::new_unchecked(sid.as_usize() + stride);
for (oldsid, state) in nnfa.states().iter().with_state_ids() {
if oldsid == noncontiguous::NFA::DEAD
|| oldsid == noncontiguous::NFA::FAIL
{
remap_unanchored[oldsid] = newsid;
remap_anchored[oldsid] = newsid;
newsid = next_dfa_id(newsid);
} else if oldsid == nnfa.special().start_unanchored_id
|| oldsid == nnfa.special().start_anchored_id
{
if oldsid == nnfa.special().start_unanchored_id {
remap_unanchored[oldsid] = newsid;
remap_anchored[oldsid] = DFA::DEAD;
} else {
remap_unanchored[oldsid] = DFA::DEAD;
remap_anchored[oldsid] = newsid;
is_anchored[newsid.as_usize() >> stride2] = true;
}
if state.is_match() {
dfa.set_matches(newsid, nnfa.iter_matches(oldsid));
}
sparse_iter(
nnfa,
oldsid,
&dfa.byte_classes,
|_, class, oldnextsid| {
let class = usize::from(class);
if oldnextsid == noncontiguous::NFA::FAIL {
dfa.trans[newsid.as_usize() + class] = DFA::DEAD;
} else {
dfa.trans[newsid.as_usize() + class] = oldnextsid;
}
},
);
newsid = next_dfa_id(newsid);
} else {
let unewsid = newsid;
newsid = next_dfa_id(newsid);
let anewsid = newsid;
newsid = next_dfa_id(newsid);
remap_unanchored[oldsid] = unewsid;
remap_anchored[oldsid] = anewsid;
is_anchored[anewsid.as_usize() >> stride2] = true;
if state.is_match() {
dfa.set_matches(unewsid, nnfa.iter_matches(oldsid));
dfa.set_matches(anewsid, nnfa.iter_matches(oldsid));
}
sparse_iter(
nnfa,
oldsid,
&dfa.byte_classes,
|byte, class, oldnextsid| {
let class = usize::from(class);
if oldnextsid == noncontiguous::NFA::FAIL {
let oldnextsid =
if state.fail() == noncontiguous::NFA::DEAD {
noncontiguous::NFA::DEAD
} else {
nnfa.next_state(
Anchored::No,
state.fail(),
byte,
)
};
dfa.trans[unewsid.as_usize() + class] = oldnextsid;
} else {
dfa.trans[unewsid.as_usize() + class] = oldnextsid;
dfa.trans[anewsid.as_usize() + class] = oldnextsid;
}
},
);
}
}
for i in 0..dfa.state_len {
let sid = i << stride2;
if is_anchored[i] {
for next in dfa.trans[sid..][..stride].iter_mut() {
*next = remap_anchored[*next];
}
} else {
for next in dfa.trans[sid..][..stride].iter_mut() {
*next = remap_unanchored[*next];
}
}
}
// Now that we've remapped all the IDs in our states, all that's left
// is remapping the special state IDs.
let old = nnfa.special();
let new = &mut dfa.special;
new.max_special_id = remap_anchored[old.max_special_id];
new.max_match_id = remap_anchored[old.max_match_id];
new.start_unanchored_id = remap_unanchored[old.start_unanchored_id];
new.start_anchored_id = remap_anchored[old.start_anchored_id];
}
/// Set the desired match semantics.
///
/// This only applies when using [`Builder::build`] and not
/// [`Builder::build_from_noncontiguous`].
///
/// See
/// [`AhoCorasickBuilder::match_kind`](crate::AhoCorasickBuilder::match_kind)
/// for more documentation and examples.
pub fn match_kind(&mut self, kind: MatchKind) -> &mut Builder {
self.noncontiguous.match_kind(kind);
self
}
/// Enable ASCII-aware case insensitive matching.
///
/// This only applies when using [`Builder::build`] and not
/// [`Builder::build_from_noncontiguous`].
///
/// See
/// [`AhoCorasickBuilder::ascii_case_insensitive`](crate::AhoCorasickBuilder::ascii_case_insensitive)
/// for more documentation and examples.
pub fn ascii_case_insensitive(&mut self, yes: bool) -> &mut Builder {
self.noncontiguous.ascii_case_insensitive(yes);
self
}
/// Enable heuristic prefilter optimizations.
///
/// This only applies when using [`Builder::build`] and not
/// [`Builder::build_from_noncontiguous`].
///
/// See
/// [`AhoCorasickBuilder::prefilter`](crate::AhoCorasickBuilder::prefilter)
/// for more documentation and examples.
pub fn prefilter(&mut self, yes: bool) -> &mut Builder {
self.noncontiguous.prefilter(yes);
self
}
/// Sets the starting state configuration for the automaton.
///
/// See
/// [`AhoCorasickBuilder::start_kind`](crate::AhoCorasickBuilder::start_kind)
/// for more documentation and examples.
pub fn start_kind(&mut self, kind: StartKind) -> &mut Builder {
self.start_kind = kind;
self
}
/// A debug setting for whether to attempt to shrink the size of the
/// automaton's alphabet or not.
///
/// This should never be enabled unless you're debugging an automaton.
/// Namely, disabling byte classes makes transitions easier to reason
/// about, since they use the actual bytes instead of equivalence classes.
/// Disabling this confers no performance benefit at search time.
///
/// See
/// [`AhoCorasickBuilder::byte_classes`](crate::AhoCorasickBuilder::byte_classes)
/// for more documentation and examples.
pub fn byte_classes(&mut self, yes: bool) -> &mut Builder {
self.byte_classes = yes;
self
}
}
/// Iterate over all possible equivalence class transitions in this state.
/// The closure is called for all transitions with a distinct equivalence
/// class, even those not explicitly represented in this sparse state. For
/// any implicitly defined transitions, the given closure is called with
/// the fail state ID.
///
/// The closure is guaranteed to be called precisely
/// `byte_classes.alphabet_len()` times, once for every possible class in
/// ascending order.
fn sparse_iter<F: FnMut(u8, u8, StateID)>(
nnfa: &noncontiguous::NFA,
oldsid: StateID,
classes: &ByteClasses,
mut f: F,
) {
let mut prev_class = None;
let mut byte = 0usize;
for t in nnfa.iter_trans(oldsid) {
while byte < usize::from(t.byte()) {
let rep = byte.as_u8();
let class = classes.get(rep);
byte += 1;
if prev_class != Some(class) {
f(rep, class, noncontiguous::NFA::FAIL);
prev_class = Some(class);
}
}
let rep = t.byte();
let class = classes.get(rep);
byte += 1;
if prev_class != Some(class) {
f(rep, class, t.next());
prev_class = Some(class);
}
}
for b in byte..=255 {
let rep = b.as_u8();
let class = classes.get(rep);
if prev_class != Some(class) {
f(rep, class, noncontiguous::NFA::FAIL);
prev_class = Some(class);
}
}
}

326
vendor/aho-corasick/src/lib.rs vendored Normal file
View File

@ -0,0 +1,326 @@
/*!
A library for finding occurrences of many patterns at once. This library
provides multiple pattern search principally through an implementation of the
[Aho-Corasick algorithm](https://en.wikipedia.org/wiki/Aho%E2%80%93Corasick_algorithm),
which builds a fast finite state machine for executing searches in linear time.
Additionally, this library provides a number of configuration options for
building the automaton that permit controlling the space versus time trade
off. Other features include simple ASCII case insensitive matching, finding
overlapping matches, replacements, searching streams and even searching and
replacing text in streams.
Finally, unlike most other Aho-Corasick implementations, this one
supports enabling [leftmost-first](MatchKind::LeftmostFirst) or
[leftmost-longest](MatchKind::LeftmostLongest) match semantics, using a
(seemingly) novel alternative construction algorithm. For more details on what
match semantics means, see the [`MatchKind`] type.
# Overview
This section gives a brief overview of the primary types in this crate:
* [`AhoCorasick`] is the primary type and represents an Aho-Corasick automaton.
This is the type you use to execute searches.
* [`AhoCorasickBuilder`] can be used to build an Aho-Corasick automaton, and
supports configuring a number of options.
* [`Match`] represents a single match reported by an Aho-Corasick automaton.
Each match has two pieces of information: the pattern that matched and the
start and end byte offsets corresponding to the position in the haystack at
which it matched.
# Example: basic searching
This example shows how to search for occurrences of multiple patterns
simultaneously. Each match includes the pattern that matched along with the
byte offsets of the match.
```
use aho_corasick::{AhoCorasick, PatternID};
let patterns = &["apple", "maple", "Snapple"];
let haystack = "Nobody likes maple in their apple flavored Snapple.";
let ac = AhoCorasick::new(patterns).unwrap();
let mut matches = vec![];
for mat in ac.find_iter(haystack) {
matches.push((mat.pattern(), mat.start(), mat.end()));
}
assert_eq!(matches, vec![
(PatternID::must(1), 13, 18),
(PatternID::must(0), 28, 33),
(PatternID::must(2), 43, 50),
]);
```
# Example: case insensitivity
This is like the previous example, but matches `Snapple` case insensitively
using `AhoCorasickBuilder`:
```
use aho_corasick::{AhoCorasick, PatternID};
let patterns = &["apple", "maple", "snapple"];
let haystack = "Nobody likes maple in their apple flavored Snapple.";
let ac = AhoCorasick::builder()
.ascii_case_insensitive(true)
.build(patterns)
.unwrap();
let mut matches = vec![];
for mat in ac.find_iter(haystack) {
matches.push((mat.pattern(), mat.start(), mat.end()));
}
assert_eq!(matches, vec![
(PatternID::must(1), 13, 18),
(PatternID::must(0), 28, 33),
(PatternID::must(2), 43, 50),
]);
```
# Example: replacing matches in a stream
This example shows how to execute a search and replace on a stream without
loading the entire stream into memory first.
```
# #[cfg(feature = "std")] {
use aho_corasick::AhoCorasick;
# fn example() -> Result<(), std::io::Error> {
let patterns = &["fox", "brown", "quick"];
let replace_with = &["sloth", "grey", "slow"];
// In a real example, these might be `std::fs::File`s instead. All you need to
// do is supply a pair of `std::io::Read` and `std::io::Write` implementations.
let rdr = "The quick brown fox.";
let mut wtr = vec![];
let ac = AhoCorasick::new(patterns).unwrap();
ac.try_stream_replace_all(rdr.as_bytes(), &mut wtr, replace_with)?;
assert_eq!(b"The slow grey sloth.".to_vec(), wtr);
# Ok(()) }; example().unwrap()
# }
```
# Example: finding the leftmost first match
In the textbook description of Aho-Corasick, its formulation is typically
structured such that it reports all possible matches, even when they overlap
with one another. In many cases, overlapping matches may not be desired, such as
the case of finding all successive non-overlapping matches like you might with
a standard regular expression.
Unfortunately the "obvious" way to modify the Aho-Corasick algorithm to do
this doesn't always work in the expected way, since it will report matches as
soon as they are seen. For example, consider matching the regex `Samwise|Sam`
against the text `Samwise`. Most regex engines (that are Perl-like, or
non-POSIX) will report `Samwise` as a match, but the standard Aho-Corasick
algorithm modified for reporting non-overlapping matches will report `Sam`.
A novel contribution of this library is the ability to change the match
semantics of Aho-Corasick (without additional search time overhead) such that
`Samwise` is reported instead. For example, here's the standard approach:
```
use aho_corasick::AhoCorasick;
let patterns = &["Samwise", "Sam"];
let haystack = "Samwise";
let ac = AhoCorasick::new(patterns).unwrap();
let mat = ac.find(haystack).expect("should have a match");
assert_eq!("Sam", &haystack[mat.start()..mat.end()]);
```
And now here's the leftmost-first version, which matches how a Perl-like
regex will work:
```
use aho_corasick::{AhoCorasick, MatchKind};
let patterns = &["Samwise", "Sam"];
let haystack = "Samwise";
let ac = AhoCorasick::builder()
.match_kind(MatchKind::LeftmostFirst)
.build(patterns)
.unwrap();
let mat = ac.find(haystack).expect("should have a match");
assert_eq!("Samwise", &haystack[mat.start()..mat.end()]);
```
In addition to leftmost-first semantics, this library also supports
leftmost-longest semantics, which match the POSIX behavior of a regular
expression alternation. See [`MatchKind`] for more details.
# Prefilters
While an Aho-Corasick automaton can perform admirably when compared to more
naive solutions, it is generally slower than more specialized algorithms that
are accelerated using vector instructions such as SIMD.
For that reason, this library will internally use a "prefilter" to attempt
to accelerate searches when possible. Currently, this library has several
different algorithms it might use depending on the patterns provided. Once the
number of patterns gets too big, prefilters are no longer used.
While a prefilter is generally good to have on by default since it works
well in the common case, it can lead to less predictable or even sub-optimal
performance in some cases. For that reason, prefilters can be explicitly
disabled via [`AhoCorasickBuilder::prefilter`].
# Lower level APIs
This crate also provides several sub-modules that collectively expose many of
the implementation details of the main [`AhoCorasick`] type. Most users of this
library can completely ignore the submodules and their contents, but if you
needed finer grained control, some parts of them may be useful to you. Here is
a brief overview of each and why you might want to use them:
* The [`packed`] sub-module contains a lower level API for using fast
vectorized routines for finding a small number of patterns in a haystack.
You might want to use this API when you want to completely side-step using
Aho-Corasick automata. Otherwise, the fast vectorized routines are used
automatically as prefilters for `AhoCorasick` searches whenever possible.
* The [`automaton`] sub-module provides a lower level finite state
machine interface that the various Aho-Corasick implementations in
this crate implement. This sub-module's main contribution is the
[`Automaton`](automaton::Automaton) trait, which permits manually walking the
state transitions of an Aho-Corasick automaton.
* The [`dfa`] and [`nfa`] sub-modules provide DFA and NFA implementations of
the aforementioned `Automaton` trait. The main reason one might want to use
these sub-modules is to get access to a type that implements the `Automaton`
trait. (The top-level `AhoCorasick` type does not implement the `Automaton`
trait.)
As mentioned above, if you aren't sure whether you need these sub-modules,
you should be able to safely ignore them and just focus on the [`AhoCorasick`]
type.
# Crate features
This crate exposes a few features for controlling dependency usage and whether
this crate can be used without the standard library.
* **std** -
Enables support for the standard library. This feature is enabled by
default. When disabled, only `core` and `alloc` are used. At an API
level, enabling `std` enables `std::error::Error` trait impls for the
various error types, and higher level stream search routines such as
[`AhoCorasick::try_stream_find_iter`]. But the `std` feature is also required
to enable vectorized prefilters. Prefilters can greatly accelerate searches,
but generally only apply when the number of patterns is small (less than
~100).
* **perf-literal** -
Enables support for literal prefilters that use vectorized routines from
external crates. This feature is enabled by default. If you're only using
Aho-Corasick for large numbers of patterns or otherwise can abide lower
throughput when searching with a small number of patterns, then it is
reasonable to disable this feature.
* **logging** -
Enables a dependency on the `log` crate and emits messages to aid in
diagnostics. This feature is disabled by default.
*/
#![no_std]
#![deny(missing_docs)]
#![deny(rustdoc::broken_intra_doc_links)]
#![cfg_attr(docsrs, feature(doc_auto_cfg))]
extern crate alloc;
#[cfg(any(test, feature = "std"))]
extern crate std;
#[cfg(doctest)]
doc_comment::doctest!("../README.md");
#[cfg(feature = "std")]
pub use crate::ahocorasick::StreamFindIter;
pub use crate::{
ahocorasick::{
AhoCorasick, AhoCorasickBuilder, AhoCorasickKind, FindIter,
FindOverlappingIter,
},
util::{
error::{BuildError, MatchError, MatchErrorKind},
primitives::{PatternID, PatternIDError},
search::{Anchored, Input, Match, MatchKind, Span, StartKind},
},
};
#[macro_use]
mod macros;
mod ahocorasick;
pub mod automaton;
pub mod dfa;
pub mod nfa;
pub mod packed;
#[cfg(test)]
mod tests;
// I wrote out the module for implementing fst::Automaton only to later realize
// that this would make fst a public dependency and fst is not at 1.0 yet. I
// decided to just keep the code in tree, but build it only during tests.
//
// TODO: I think I've changed my mind again. I'm considering pushing it out
// into either a separate crate or into 'fst' directly as an optional feature.
// #[cfg(test)]
// #[allow(dead_code)]
// mod transducer;
pub(crate) mod util;
#[cfg(test)]
mod testoibits {
use std::panic::{RefUnwindSafe, UnwindSafe};
use super::*;
fn assert_all<T: Send + Sync + UnwindSafe + RefUnwindSafe>() {}
#[test]
fn oibits_main() {
assert_all::<AhoCorasick>();
assert_all::<AhoCorasickBuilder>();
assert_all::<AhoCorasickKind>();
assert_all::<FindIter>();
assert_all::<FindOverlappingIter>();
assert_all::<BuildError>();
assert_all::<MatchError>();
assert_all::<MatchErrorKind>();
assert_all::<Anchored>();
assert_all::<Input>();
assert_all::<Match>();
assert_all::<MatchKind>();
assert_all::<Span>();
assert_all::<StartKind>();
}
#[test]
fn oibits_automaton() {
use crate::{automaton, dfa::DFA};
assert_all::<automaton::FindIter<DFA>>();
assert_all::<automaton::FindOverlappingIter<DFA>>();
#[cfg(feature = "std")]
assert_all::<automaton::StreamFindIter<DFA, std::io::Stdin>>();
assert_all::<automaton::OverlappingState>();
assert_all::<automaton::Prefilter>();
assert_all::<automaton::Candidate>();
}
#[test]
fn oibits_packed() {
use crate::packed;
assert_all::<packed::Config>();
assert_all::<packed::Builder>();
assert_all::<packed::Searcher>();
assert_all::<packed::FindIter>();
assert_all::<packed::MatchKind>();
}
}

18
vendor/aho-corasick/src/macros.rs vendored Normal file
View File

@ -0,0 +1,18 @@
#![allow(unused_macros)]
macro_rules! log {
($($tt:tt)*) => {
#[cfg(feature = "logging")]
{
$($tt)*
}
}
}
macro_rules! debug {
($($tt:tt)*) => { log!(log::debug!($($tt)*)) }
}
macro_rules! trace {
($($tt:tt)*) => { log!(log::trace!($($tt)*)) }
}

1141
vendor/aho-corasick/src/nfa/contiguous.rs vendored Normal file

File diff suppressed because it is too large Load Diff

40
vendor/aho-corasick/src/nfa/mod.rs vendored Normal file
View File

@ -0,0 +1,40 @@
/*!
Provides direct access to NFA implementations of Aho-Corasick.
The principal characteristic of an NFA in this crate is that it may
transition through multiple states per byte of haystack. In Aho-Corasick
parlance, NFAs follow failure transitions during a search. In contrast,
a [`DFA`](crate::dfa::DFA) pre-computes all failure transitions during
compilation at the expense of a much bigger memory footprint.
Currently, there are two NFA implementations provided: noncontiguous and
contiguous. The names reflect their internal representation, and consequently,
the trade offs associated with them:
* A [`noncontiguous::NFA`] uses a separate allocation for every NFA state to
represent its transitions in a sparse format. This is ideal for building an
NFA, since it cheaply permits different states to have a different number of
transitions. A noncontiguous NFA is where the main Aho-Corasick construction
algorithm is implemented. All other Aho-Corasick implementations are built by
first constructing a noncontiguous NFA.
* A [`contiguous::NFA`] uses a single allocation to represent all states,
while still encoding most states as sparse states but permitting states near
the starting state to have a dense representation. The dense representation
uses more memory, but permits computing transitions during a search more
quickly. By only making the most active states dense (the states near the
starting state), a contiguous NFA better balances memory usage with search
speed. The single contiguous allocation also uses less overhead per state and
enables compression tricks where most states only use 8 bytes of heap memory.
When given the choice between these two, you almost always want to pick a
contiguous NFA. It takes only a little longer to build, but both its memory
usage and search speed are typically much better than a noncontiguous NFA. A
noncontiguous NFA is useful when prioritizing build times, or when there are
so many patterns that a contiguous NFA could not be built. (Currently, because
of both memory and search speed improvements, a contiguous NFA has a smaller
internal limit on the total number of NFA states it can represent. But you
would likely need to have hundreds of thousands or even millions of patterns
before you hit this limit.)
*/
pub mod contiguous;
pub mod noncontiguous;

File diff suppressed because it is too large Load Diff

687
vendor/aho-corasick/src/packed/api.rs vendored Normal file
View File

@ -0,0 +1,687 @@
use alloc::sync::Arc;
use crate::{
packed::{pattern::Patterns, rabinkarp::RabinKarp, teddy},
util::search::{Match, Span},
};
/// This is a limit placed on the total number of patterns we're willing to try
/// and match at once. As more sophisticated algorithms are added, this number
/// may be increased.
const PATTERN_LIMIT: usize = 128;
/// A knob for controlling the match semantics of a packed multiple string
/// searcher.
///
/// This differs from the [`MatchKind`](crate::MatchKind) type in the top-level
/// crate module in that it doesn't support "standard" match semantics,
/// and instead only supports leftmost-first or leftmost-longest. Namely,
/// "standard" semantics cannot be easily supported by packed searchers.
///
/// For more information on the distinction between leftmost-first and
/// leftmost-longest, see the docs on the top-level `MatchKind` type.
///
/// Unlike the top-level `MatchKind` type, the default match semantics for this
/// type are leftmost-first.
#[derive(Clone, Copy, Debug, Eq, PartialEq)]
#[non_exhaustive]
pub enum MatchKind {
/// Use leftmost-first match semantics, which reports leftmost matches.
/// When there are multiple possible leftmost matches, the match
/// corresponding to the pattern that appeared earlier when constructing
/// the automaton is reported.
///
/// This is the default.
LeftmostFirst,
/// Use leftmost-longest match semantics, which reports leftmost matches.
/// When there are multiple possible leftmost matches, the longest match
/// is chosen.
LeftmostLongest,
}
impl Default for MatchKind {
fn default() -> MatchKind {
MatchKind::LeftmostFirst
}
}
/// The configuration for a packed multiple pattern searcher.
///
/// The configuration is currently limited only to being able to select the
/// match semantics (leftmost-first or leftmost-longest) of a searcher. In the
/// future, more knobs may be made available.
///
/// A configuration produces a [`packed::Builder`](Builder), which in turn can
/// be used to construct a [`packed::Searcher`](Searcher) for searching.
///
/// # Example
///
/// This example shows how to use leftmost-longest semantics instead of the
/// default (leftmost-first).
///
/// ```
/// use aho_corasick::{packed::{Config, MatchKind}, PatternID};
///
/// # fn example() -> Option<()> {
/// let searcher = Config::new()
/// .match_kind(MatchKind::LeftmostLongest)
/// .builder()
/// .add("foo")
/// .add("foobar")
/// .build()?;
/// let matches: Vec<PatternID> = searcher
/// .find_iter("foobar")
/// .map(|mat| mat.pattern())
/// .collect();
/// assert_eq!(vec![PatternID::must(1)], matches);
/// # Some(()) }
/// # if cfg!(all(feature = "std", any(
/// # target_arch = "x86_64", target_arch = "aarch64",
/// # ))) {
/// # example().unwrap()
/// # } else {
/// # assert!(example().is_none());
/// # }
/// ```
#[derive(Clone, Debug)]
pub struct Config {
kind: MatchKind,
force: Option<ForceAlgorithm>,
only_teddy_fat: Option<bool>,
only_teddy_256bit: Option<bool>,
heuristic_pattern_limits: bool,
}
/// An internal option for forcing the use of a particular packed algorithm.
///
/// When an algorithm is forced, if a searcher could not be constructed for it,
/// then no searcher will be returned even if an alternative algorithm would
/// work.
#[derive(Clone, Debug)]
enum ForceAlgorithm {
Teddy,
RabinKarp,
}
impl Default for Config {
fn default() -> Config {
Config::new()
}
}
impl Config {
/// Create a new default configuration. A default configuration uses
/// leftmost-first match semantics.
pub fn new() -> Config {
Config {
kind: MatchKind::LeftmostFirst,
force: None,
only_teddy_fat: None,
only_teddy_256bit: None,
heuristic_pattern_limits: true,
}
}
/// Create a packed builder from this configuration. The builder can be
/// used to accumulate patterns and create a [`Searcher`] from them.
pub fn builder(&self) -> Builder {
Builder::from_config(self.clone())
}
/// Set the match semantics for this configuration.
pub fn match_kind(&mut self, kind: MatchKind) -> &mut Config {
self.kind = kind;
self
}
/// An undocumented method for forcing the use of the Teddy algorithm.
///
/// This is only exposed for more precise testing and benchmarks. Callers
/// should not use it as it is not part of the API stability guarantees of
/// this crate.
#[doc(hidden)]
pub fn only_teddy(&mut self, yes: bool) -> &mut Config {
if yes {
self.force = Some(ForceAlgorithm::Teddy);
} else {
self.force = None;
}
self
}
/// An undocumented method for forcing the use of the Fat Teddy algorithm.
///
/// This is only exposed for more precise testing and benchmarks. Callers
/// should not use it as it is not part of the API stability guarantees of
/// this crate.
#[doc(hidden)]
pub fn only_teddy_fat(&mut self, yes: Option<bool>) -> &mut Config {
self.only_teddy_fat = yes;
self
}
/// An undocumented method for forcing the use of SSE (`Some(false)`) or
/// AVX (`Some(true)`) algorithms.
///
/// This is only exposed for more precise testing and benchmarks. Callers
/// should not use it as it is not part of the API stability guarantees of
/// this crate.
#[doc(hidden)]
pub fn only_teddy_256bit(&mut self, yes: Option<bool>) -> &mut Config {
self.only_teddy_256bit = yes;
self
}
/// An undocumented method for forcing the use of the Rabin-Karp algorithm.
///
/// This is only exposed for more precise testing and benchmarks. Callers
/// should not use it as it is not part of the API stability guarantees of
/// this crate.
#[doc(hidden)]
pub fn only_rabin_karp(&mut self, yes: bool) -> &mut Config {
if yes {
self.force = Some(ForceAlgorithm::RabinKarp);
} else {
self.force = None;
}
self
}
/// Request that heuristic limitations on the number of patterns be
/// employed. This is useful to disable for benchmarking, where one wants to
/// explore how Teddy performs on a large number of patterns even if the
/// heuristics would otherwise refuse construction.
///
/// This is enabled by default.
pub fn heuristic_pattern_limits(&mut self, yes: bool) -> &mut Config {
self.heuristic_pattern_limits = yes;
self
}
}
/// A builder for constructing a packed searcher from a collection of patterns.
///
/// # Example
///
/// This example shows how to use a builder to construct a searcher. By
/// default, leftmost-first match semantics are used.
///
/// ```
/// use aho_corasick::{packed::{Builder, MatchKind}, PatternID};
///
/// # fn example() -> Option<()> {
/// let searcher = Builder::new()
/// .add("foobar")
/// .add("foo")
/// .build()?;
/// let matches: Vec<PatternID> = searcher
/// .find_iter("foobar")
/// .map(|mat| mat.pattern())
/// .collect();
/// assert_eq!(vec![PatternID::ZERO], matches);
/// # Some(()) }
/// # if cfg!(all(feature = "std", any(
/// # target_arch = "x86_64", target_arch = "aarch64",
/// # ))) {
/// # example().unwrap()
/// # } else {
/// # assert!(example().is_none());
/// # }
/// ```
#[derive(Clone, Debug)]
pub struct Builder {
/// The configuration of this builder and subsequent matcher.
config: Config,
/// Set to true if the builder detects that a matcher cannot be built.
inert: bool,
/// The patterns provided by the caller.
patterns: Patterns,
}
impl Builder {
/// Create a new builder for constructing a multi-pattern searcher. This
/// constructor uses the default configuration.
pub fn new() -> Builder {
Builder::from_config(Config::new())
}
fn from_config(config: Config) -> Builder {
Builder { config, inert: false, patterns: Patterns::new() }
}
/// Build a searcher from the patterns added to this builder so far.
pub fn build(&self) -> Option<Searcher> {
if self.inert || self.patterns.is_empty() {
return None;
}
let mut patterns = self.patterns.clone();
patterns.set_match_kind(self.config.kind);
let patterns = Arc::new(patterns);
let rabinkarp = RabinKarp::new(&patterns);
// Effectively, we only want to return a searcher if we can use Teddy,
// since Teddy is our only fast packed searcher at the moment.
// Rabin-Karp is only used when searching haystacks smaller than what
// Teddy can support. Thus, the only way to get a Rabin-Karp searcher
// is to force it using undocumented APIs (for tests/benchmarks).
let (search_kind, minimum_len) = match self.config.force {
None | Some(ForceAlgorithm::Teddy) => {
debug!("trying to build Teddy packed matcher");
let teddy = match self.build_teddy(Arc::clone(&patterns)) {
None => return None,
Some(teddy) => teddy,
};
let minimum_len = teddy.minimum_len();
(SearchKind::Teddy(teddy), minimum_len)
}
Some(ForceAlgorithm::RabinKarp) => {
debug!("using Rabin-Karp packed matcher");
(SearchKind::RabinKarp, 0)
}
};
Some(Searcher { patterns, rabinkarp, search_kind, minimum_len })
}
fn build_teddy(&self, patterns: Arc<Patterns>) -> Option<teddy::Searcher> {
teddy::Builder::new()
.only_256bit(self.config.only_teddy_256bit)
.only_fat(self.config.only_teddy_fat)
.heuristic_pattern_limits(self.config.heuristic_pattern_limits)
.build(patterns)
}
/// Add the given pattern to this set to match.
///
/// The order in which patterns are added is significant. Namely, when
/// using leftmost-first match semantics, then when multiple patterns can
/// match at a particular location, the pattern that was added first is
/// used as the match.
///
/// If the number of patterns added exceeds the amount supported by packed
/// searchers, then the builder will stop accumulating patterns and render
/// itself inert. At this point, constructing a searcher will always return
/// `None`.
pub fn add<P: AsRef<[u8]>>(&mut self, pattern: P) -> &mut Builder {
if self.inert {
return self;
} else if self.patterns.len() >= PATTERN_LIMIT {
self.inert = true;
self.patterns.reset();
return self;
}
// Just in case PATTERN_LIMIT increases beyond u16::MAX.
assert!(self.patterns.len() <= core::u16::MAX as usize);
let pattern = pattern.as_ref();
if pattern.is_empty() {
self.inert = true;
self.patterns.reset();
return self;
}
self.patterns.add(pattern);
self
}
/// Add the given iterator of patterns to this set to match.
///
/// The iterator must yield elements that can be converted into a `&[u8]`.
///
/// The order in which patterns are added is significant. Namely, when
/// using leftmost-first match semantics, then when multiple patterns can
/// match at a particular location, the pattern that was added first is
/// used as the match.
///
/// If the number of patterns added exceeds the amount supported by packed
/// searchers, then the builder will stop accumulating patterns and render
/// itself inert. At this point, constructing a searcher will always return
/// `None`.
pub fn extend<I, P>(&mut self, patterns: I) -> &mut Builder
where
I: IntoIterator<Item = P>,
P: AsRef<[u8]>,
{
for p in patterns {
self.add(p);
}
self
}
/// Returns the number of patterns added to this builder.
pub fn len(&self) -> usize {
self.patterns.len()
}
/// Returns the length, in bytes, of the shortest pattern added.
pub fn minimum_len(&self) -> usize {
self.patterns.minimum_len()
}
}
impl Default for Builder {
fn default() -> Builder {
Builder::new()
}
}
/// A packed searcher for quickly finding occurrences of multiple patterns.
///
/// If callers need more flexible construction, or if one wants to change the
/// match semantics (either leftmost-first or leftmost-longest), then one can
/// use the [`Config`] and/or [`Builder`] types for more fine grained control.
///
/// # Example
///
/// This example shows how to create a searcher from an iterator of patterns.
/// By default, leftmost-first match semantics are used.
///
/// ```
/// use aho_corasick::{packed::{MatchKind, Searcher}, PatternID};
///
/// # fn example() -> Option<()> {
/// let searcher = Searcher::new(["foobar", "foo"].iter().cloned())?;
/// let matches: Vec<PatternID> = searcher
/// .find_iter("foobar")
/// .map(|mat| mat.pattern())
/// .collect();
/// assert_eq!(vec![PatternID::ZERO], matches);
/// # Some(()) }
/// # if cfg!(all(feature = "std", any(
/// # target_arch = "x86_64", target_arch = "aarch64",
/// # ))) {
/// # example().unwrap()
/// # } else {
/// # assert!(example().is_none());
/// # }
/// ```
#[derive(Clone, Debug)]
pub struct Searcher {
patterns: Arc<Patterns>,
rabinkarp: RabinKarp,
search_kind: SearchKind,
minimum_len: usize,
}
#[derive(Clone, Debug)]
enum SearchKind {
Teddy(teddy::Searcher),
RabinKarp,
}
impl Searcher {
/// A convenience function for constructing a searcher from an iterator
/// of things that can be converted to a `&[u8]`.
///
/// If a searcher could not be constructed (either because of an
/// unsupported CPU or because there are too many patterns), then `None`
/// is returned.
///
/// # Example
///
/// Basic usage:
///
/// ```
/// use aho_corasick::{packed::{MatchKind, Searcher}, PatternID};
///
/// # fn example() -> Option<()> {
/// let searcher = Searcher::new(["foobar", "foo"].iter().cloned())?;
/// let matches: Vec<PatternID> = searcher
/// .find_iter("foobar")
/// .map(|mat| mat.pattern())
/// .collect();
/// assert_eq!(vec![PatternID::ZERO], matches);
/// # Some(()) }
/// # if cfg!(all(feature = "std", any(
/// # target_arch = "x86_64", target_arch = "aarch64",
/// # ))) {
/// # example().unwrap()
/// # } else {
/// # assert!(example().is_none());
/// # }
/// ```
pub fn new<I, P>(patterns: I) -> Option<Searcher>
where
I: IntoIterator<Item = P>,
P: AsRef<[u8]>,
{
Builder::new().extend(patterns).build()
}
/// A convenience function for calling `Config::new()`.
///
/// This is useful for avoiding an additional import.
pub fn config() -> Config {
Config::new()
}
/// A convenience function for calling `Builder::new()`.
///
/// This is useful for avoiding an additional import.
pub fn builder() -> Builder {
Builder::new()
}
/// Return the first occurrence of any of the patterns in this searcher,
/// according to its match semantics, in the given haystack. The `Match`
/// returned will include the identifier of the pattern that matched, which
/// corresponds to the index of the pattern (starting from `0`) in which it
/// was added.
///
/// # Example
///
/// Basic usage:
///
/// ```
/// use aho_corasick::{packed::{MatchKind, Searcher}, PatternID};
///
/// # fn example() -> Option<()> {
/// let searcher = Searcher::new(["foobar", "foo"].iter().cloned())?;
/// let mat = searcher.find("foobar")?;
/// assert_eq!(PatternID::ZERO, mat.pattern());
/// assert_eq!(0, mat.start());
/// assert_eq!(6, mat.end());
/// # Some(()) }
/// # if cfg!(all(feature = "std", any(
/// # target_arch = "x86_64", target_arch = "aarch64",
/// # ))) {
/// # example().unwrap()
/// # } else {
/// # assert!(example().is_none());
/// # }
/// ```
#[inline]
pub fn find<B: AsRef<[u8]>>(&self, haystack: B) -> Option<Match> {
let haystack = haystack.as_ref();
self.find_in(haystack, Span::from(0..haystack.len()))
}
/// Return the first occurrence of any of the patterns in this searcher,
/// according to its match semantics, in the given haystack starting from
/// the given position.
///
/// The `Match` returned will include the identifier of the pattern that
/// matched, which corresponds to the index of the pattern (starting from
/// `0`) in which it was added. The offsets in the `Match` will be relative
/// to the start of `haystack` (and not `at`).
///
/// # Example
///
/// Basic usage:
///
/// ```
/// use aho_corasick::{packed::{MatchKind, Searcher}, PatternID, Span};
///
/// # fn example() -> Option<()> {
/// let haystack = "foofoobar";
/// let searcher = Searcher::new(["foobar", "foo"].iter().cloned())?;
/// let mat = searcher.find_in(haystack, Span::from(3..haystack.len()))?;
/// assert_eq!(PatternID::ZERO, mat.pattern());
/// assert_eq!(3, mat.start());
/// assert_eq!(9, mat.end());
/// # Some(()) }
/// # if cfg!(all(feature = "std", any(
/// # target_arch = "x86_64", target_arch = "aarch64",
/// # ))) {
/// # example().unwrap()
/// # } else {
/// # assert!(example().is_none());
/// # }
/// ```
#[inline]
pub fn find_in<B: AsRef<[u8]>>(
&self,
haystack: B,
span: Span,
) -> Option<Match> {
let haystack = haystack.as_ref();
match self.search_kind {
SearchKind::Teddy(ref teddy) => {
if haystack[span].len() < teddy.minimum_len() {
return self.find_in_slow(haystack, span);
}
teddy.find(&haystack[..span.end], span.start)
}
SearchKind::RabinKarp => {
self.rabinkarp.find_at(&haystack[..span.end], span.start)
}
}
}
/// Return an iterator of non-overlapping occurrences of the patterns in
/// this searcher, according to its match semantics, in the given haystack.
///
/// # Example
///
/// Basic usage:
///
/// ```
/// use aho_corasick::{packed::{MatchKind, Searcher}, PatternID};
///
/// # fn example() -> Option<()> {
/// let searcher = Searcher::new(["foobar", "foo"].iter().cloned())?;
/// let matches: Vec<PatternID> = searcher
/// .find_iter("foobar fooba foofoo")
/// .map(|mat| mat.pattern())
/// .collect();
/// assert_eq!(vec![
/// PatternID::must(0),
/// PatternID::must(1),
/// PatternID::must(1),
/// PatternID::must(1),
/// ], matches);
/// # Some(()) }
/// # if cfg!(all(feature = "std", any(
/// # target_arch = "x86_64", target_arch = "aarch64",
/// # ))) {
/// # example().unwrap()
/// # } else {
/// # assert!(example().is_none());
/// # }
/// ```
#[inline]
pub fn find_iter<'a, 'b, B: ?Sized + AsRef<[u8]>>(
&'a self,
haystack: &'b B,
) -> FindIter<'a, 'b> {
let haystack = haystack.as_ref();
let span = Span::from(0..haystack.len());
FindIter { searcher: self, haystack, span }
}
/// Returns the match kind used by this packed searcher.
///
/// # Examples
///
/// Basic usage:
///
/// ```
/// use aho_corasick::packed::{MatchKind, Searcher};
///
/// # fn example() -> Option<()> {
/// let searcher = Searcher::new(["foobar", "foo"].iter().cloned())?;
/// // leftmost-first is the default.
/// assert_eq!(&MatchKind::LeftmostFirst, searcher.match_kind());
/// # Some(()) }
/// # if cfg!(all(feature = "std", any(
/// # target_arch = "x86_64", target_arch = "aarch64",
/// # ))) {
/// # example().unwrap()
/// # } else {
/// # assert!(example().is_none());
/// # }
/// ```
#[inline]
pub fn match_kind(&self) -> &MatchKind {
self.patterns.match_kind()
}
/// Returns the minimum length of a haystack that is required in order for
/// packed searching to be effective.
///
/// In some cases, the underlying packed searcher may not be able to search
/// very short haystacks. When that occurs, the implementation will defer
/// to a slower non-packed searcher (which is still generally faster than
/// Aho-Corasick for a small number of patterns). However, callers may
/// want to avoid ever using the slower variant, which one can do by
/// never passing a haystack shorter than the minimum length returned by
/// this method.
#[inline]
pub fn minimum_len(&self) -> usize {
self.minimum_len
}
/// Returns the approximate total amount of heap used by this searcher, in
/// units of bytes.
#[inline]
pub fn memory_usage(&self) -> usize {
self.patterns.memory_usage()
+ self.rabinkarp.memory_usage()
+ self.search_kind.memory_usage()
}
/// Use a slow (non-packed) searcher.
///
/// This is useful when a packed searcher could be constructed, but could
/// not be used to search a specific haystack. For example, if Teddy was
/// built but the haystack is smaller than ~34 bytes, then Teddy might not
/// be able to run.
fn find_in_slow(&self, haystack: &[u8], span: Span) -> Option<Match> {
self.rabinkarp.find_at(&haystack[..span.end], span.start)
}
}
impl SearchKind {
fn memory_usage(&self) -> usize {
match *self {
SearchKind::Teddy(ref ted) => ted.memory_usage(),
SearchKind::RabinKarp => 0,
}
}
}
/// An iterator over non-overlapping matches from a packed searcher.
///
/// The lifetime `'s` refers to the lifetime of the underlying [`Searcher`],
/// while the lifetime `'h` refers to the lifetime of the haystack being
/// searched.
#[derive(Debug)]
pub struct FindIter<'s, 'h> {
searcher: &'s Searcher,
haystack: &'h [u8],
span: Span,
}
impl<'s, 'h> Iterator for FindIter<'s, 'h> {
type Item = Match;
fn next(&mut self) -> Option<Match> {
if self.span.start > self.span.end {
return None;
}
match self.searcher.find_in(&self.haystack, self.span) {
None => None,
Some(m) => {
self.span.start = m.end();
Some(m)
}
}
}
}

39
vendor/aho-corasick/src/packed/ext.rs vendored Normal file
View File

@ -0,0 +1,39 @@
/// A trait for adding some helper routines to pointers.
pub(crate) trait Pointer {
/// Returns the distance, in units of `T`, between `self` and `origin`.
///
/// # Safety
///
/// Same requirements as `ptr::offset_from`, in addition to `self >= origin`.
unsafe fn distance(self, origin: Self) -> usize;
/// Casts this pointer to `usize`.
///
/// Callers should not convert the `usize` back to a pointer if at all
/// possible. (And if you believe it's necessary, open an issue to discuss
/// why. Otherwise, it has the potential to violate pointer provenance.)
/// The purpose of this function is just to be able to do arithmetic, i.e.,
/// computing offsets or alignments.
fn as_usize(self) -> usize;
}
impl<T> Pointer for *const T {
unsafe fn distance(self, origin: *const T) -> usize {
// TODO: Replace with `ptr::sub_ptr` once stabilized.
usize::try_from(self.offset_from(origin)).unwrap_unchecked()
}
fn as_usize(self) -> usize {
self as usize
}
}
impl<T> Pointer for *mut T {
unsafe fn distance(self, origin: *mut T) -> usize {
(self as *const T).distance(origin as *const T)
}
fn as_usize(self) -> usize {
(self as *const T).as_usize()
}
}

120
vendor/aho-corasick/src/packed/mod.rs vendored Normal file
View File

@ -0,0 +1,120 @@
/*!
Provides packed multiple substring search, principally for a small number of
patterns.
This sub-module provides vectorized routines for quickly finding
matches of a small number of patterns. In general, users of this crate
shouldn't need to interface with this module directly, as the primary
[`AhoCorasick`](crate::AhoCorasick) searcher will use these routines
automatically as a prefilter when applicable. However, in some cases, callers
may want to bypass the Aho-Corasick machinery entirely and use this vectorized
searcher directly.
# Overview
The primary types in this sub-module are:
* [`Searcher`] executes the actual search algorithm to report matches in a
haystack.
* [`Builder`] accumulates patterns incrementally and can construct a
`Searcher`.
* [`Config`] permits tuning the searcher, and itself will produce a `Builder`
(which can then be used to build a `Searcher`). Currently, the only tuneable
knob is the match semantics, but this may be expanded in the future.
# Examples
This example shows how to create a searcher from an iterator of patterns.
By default, leftmost-first match semantics are used. (See the top-level
[`MatchKind`] type for more details about match semantics, which apply
similarly to packed substring search.)
```
use aho_corasick::{packed::{MatchKind, Searcher}, PatternID};
# fn example() -> Option<()> {
let searcher = Searcher::new(["foobar", "foo"].iter().cloned())?;
let matches: Vec<PatternID> = searcher
.find_iter("foobar")
.map(|mat| mat.pattern())
.collect();
assert_eq!(vec![PatternID::ZERO], matches);
# Some(()) }
# if cfg!(all(feature = "std", any(
# target_arch = "x86_64", target_arch = "aarch64",
# ))) {
# example().unwrap()
# } else {
# assert!(example().is_none());
# }
```
This example shows how to use [`Config`] to change the match semantics to
leftmost-longest:
```
use aho_corasick::{packed::{Config, MatchKind}, PatternID};
# fn example() -> Option<()> {
let searcher = Config::new()
.match_kind(MatchKind::LeftmostLongest)
.builder()
.add("foo")
.add("foobar")
.build()?;
let matches: Vec<PatternID> = searcher
.find_iter("foobar")
.map(|mat| mat.pattern())
.collect();
assert_eq!(vec![PatternID::must(1)], matches);
# Some(()) }
# if cfg!(all(feature = "std", any(
# target_arch = "x86_64", target_arch = "aarch64",
# ))) {
# example().unwrap()
# } else {
# assert!(example().is_none());
# }
```
# Packed substring searching
Packed substring searching refers to the use of SIMD (Single Instruction,
Multiple Data) to accelerate the detection of matches in a haystack. Unlike
conventional algorithms, such as Aho-Corasick, SIMD algorithms for substring
search tend to do better with a small number of patterns, whereas Aho-Corasick
generally maintains reasonably consistent performance regardless of the number
of patterns you give it. Because of this, the vectorized searcher in this
sub-module cannot be used as a general purpose searcher, since building the
searcher may fail even when given a small number of patterns. However, in
exchange, when searching for a small number of patterns, searching can be quite
a bit faster than Aho-Corasick (sometimes by an order of magnitude).
The key take away here is that constructing a searcher from a list of patterns
is a fallible operation with no clear rules for when it will fail. While the
precise conditions under which building a searcher can fail are specifically an
implementation detail, here are some common reasons:
* Too many patterns were given. Typically, the limit is on the order of 100 or
so, but this limit may fluctuate based on available CPU features.
* The available packed algorithms require CPU features that aren't available.
For example, currently, this crate only provides packed algorithms for
`x86_64` and `aarch64`. Therefore, constructing a packed searcher on any
other target will always fail.
* Zero patterns were given, or one of the patterns given was empty. Packed
searchers require at least one pattern and that all patterns are non-empty.
* Something else about the nature of the patterns (typically based on
heuristics) suggests that a packed searcher would perform very poorly, so
no searcher is built.
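As a rough illustration of coping with that fallibility (this sketch is not
part of the crate's own documentation, and `find_first` is a hypothetical
helper), one can simply fall back to the general
[`AhoCorasick`](crate::AhoCorasick) searcher whenever a packed searcher cannot
be built:

```
use aho_corasick::{packed, AhoCorasick};

// Hypothetical helper: try the vectorized packed searcher first, and fall
// back to the general Aho-Corasick automaton when it cannot be built.
fn find_first(patterns: &[&str], haystack: &str) -> Option<(usize, usize)> {
    if let Some(searcher) = packed::Searcher::new(patterns.iter().copied()) {
        return searcher.find(haystack).map(|m| (m.start(), m.end()));
    }
    let ac = AhoCorasick::new(patterns).ok()?;
    ac.find(haystack).map(|m| (m.start(), m.end()))
}
```

Note that the two paths above do not use identical match semantics by default
(packed searchers default to leftmost-first, while `AhoCorasick` defaults to
"standard" semantics), so a real fallback would configure both consistently.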
*/
pub use crate::packed::api::{Builder, Config, FindIter, MatchKind, Searcher};
mod api;
mod ext;
mod pattern;
mod rabinkarp;
mod teddy;
#[cfg(all(feature = "std", test))]
mod tests;
mod vector;

View File

@ -0,0 +1,480 @@
use core::{cmp, fmt, mem, u16, usize};
use alloc::{boxed::Box, string::String, vec, vec::Vec};
use crate::{
packed::{api::MatchKind, ext::Pointer},
PatternID,
};
/// A non-empty collection of non-empty patterns to search for.
///
/// This collection of patterns is what is passed around to both execute
/// searches and to construct the searchers themselves. Namely, this permits
/// searches to avoid copying all of the patterns, and allows us to keep only
/// one copy throughout all packed searchers.
///
/// Note that this collection is not a set. The same pattern can appear more
/// than once.
#[derive(Clone, Debug)]
pub(crate) struct Patterns {
/// The match semantics supported by this collection of patterns.
///
/// The match semantics determines the order of the iterator over patterns.
/// For leftmost-first, patterns are provided in the same order as were
/// provided by the caller. For leftmost-longest, patterns are provided in
/// descending order of length, with ties broken by the order in which they
/// were provided by the caller.
kind: MatchKind,
/// The collection of patterns, indexed by their identifier.
by_id: Vec<Vec<u8>>,
/// The order of patterns defined for iteration, given by pattern
/// identifiers. The order of `by_id` and `order` is always the same for
/// leftmost-first semantics, but may be different for leftmost-longest
/// semantics.
order: Vec<PatternID>,
/// The length of the smallest pattern, in bytes.
minimum_len: usize,
/// The total number of pattern bytes across the entire collection. This
/// is used for reporting total heap usage in constant time.
total_pattern_bytes: usize,
}
// BREADCRUMBS: I think we want to experiment with a different bucket
// representation. Basically, each bucket is just a Range<usize> to a single
// contiguous allocation? Maybe length-prefixed patterns or something? The
// idea is to try to get rid of the pointer chasing in verification. I don't
// know that that is the issue, but I suspect it is.
impl Patterns {
/// Create a new collection of patterns for the given match semantics. The
/// ID of each pattern is the index of the pattern at which it occurs in
/// the `by_id` slice.
///
/// If any of the patterns in the slice given are empty, then this panics.
/// Similarly, if the number of patterns given is zero, then this also
/// panics.
pub(crate) fn new() -> Patterns {
Patterns {
kind: MatchKind::default(),
by_id: vec![],
order: vec![],
minimum_len: usize::MAX,
total_pattern_bytes: 0,
}
}
/// Add a pattern to this collection.
///
/// This panics if the pattern given is empty.
pub(crate) fn add(&mut self, bytes: &[u8]) {
assert!(!bytes.is_empty());
assert!(self.by_id.len() <= u16::MAX as usize);
let id = PatternID::new(self.by_id.len()).unwrap();
self.order.push(id);
self.by_id.push(bytes.to_vec());
self.minimum_len = cmp::min(self.minimum_len, bytes.len());
self.total_pattern_bytes += bytes.len();
}
/// Set the match kind semantics for this collection of patterns.
///
/// If the kind is not set, then the default is leftmost-first.
pub(crate) fn set_match_kind(&mut self, kind: MatchKind) {
self.kind = kind;
match self.kind {
MatchKind::LeftmostFirst => {
self.order.sort();
}
MatchKind::LeftmostLongest => {
let (order, by_id) = (&mut self.order, &mut self.by_id);
order.sort_by(|&id1, &id2| {
by_id[id1].len().cmp(&by_id[id2].len()).reverse()
});
}
}
}
/// Return the number of patterns in this collection.
///
/// This is guaranteed to be greater than zero.
pub(crate) fn len(&self) -> usize {
self.by_id.len()
}
/// Returns true if and only if this collection of patterns is empty.
pub(crate) fn is_empty(&self) -> bool {
self.len() == 0
}
/// Returns the approximate total amount of heap used by these patterns, in
/// units of bytes.
pub(crate) fn memory_usage(&self) -> usize {
self.order.len() * mem::size_of::<PatternID>()
+ self.by_id.len() * mem::size_of::<Vec<u8>>()
+ self.total_pattern_bytes
}
/// Clears all heap memory associated with this collection of patterns and
/// resets all state such that it is a valid empty collection.
pub(crate) fn reset(&mut self) {
self.kind = MatchKind::default();
self.by_id.clear();
self.order.clear();
self.minimum_len = usize::MAX;
}
/// Returns the length, in bytes, of the smallest pattern.
///
/// This is guaranteed to be at least one.
pub(crate) fn minimum_len(&self) -> usize {
self.minimum_len
}
/// Returns the match semantics used by these patterns.
pub(crate) fn match_kind(&self) -> &MatchKind {
&self.kind
}
/// Return the pattern with the given identifier. If such a pattern does
/// not exist, then this panics.
pub(crate) fn get(&self, id: PatternID) -> Pattern<'_> {
Pattern(&self.by_id[id])
}
/// Return the pattern with the given identifier without performing bounds
/// checks.
///
/// # Safety
///
/// Callers must ensure that a pattern with the given identifier exists
/// before using this method.
pub(crate) unsafe fn get_unchecked(&self, id: PatternID) -> Pattern<'_> {
Pattern(self.by_id.get_unchecked(id.as_usize()))
}
/// Return an iterator over all the patterns in this collection, in the
/// order in which they should be matched.
///
/// Specifically, in a naive multi-pattern matcher, the following is
/// guaranteed to satisfy the match semantics of this collection of
/// patterns:
///
/// ```ignore
/// for i in 0..haystack.len():
/// for p in patterns.iter():
/// if haystack[i..].starts_with(p.bytes()):
/// return Match(p.id(), i, i + p.bytes().len())
/// ```
///
/// Namely, among the patterns in a collection, if they are matched in
/// the order provided by this iterator, then the result is guaranteed
/// to satisfy the correct match semantics. (Either leftmost-first or
/// leftmost-longest.)
pub(crate) fn iter(&self) -> PatternIter<'_> {
PatternIter { patterns: self, i: 0 }
}
}
/// An iterator over the patterns in the `Patterns` collection.
///
/// The order of the patterns provided by this iterator is consistent with the
/// match semantics of the originating collection of patterns.
///
/// The lifetime `'p` corresponds to the lifetime of the collection of patterns
/// this is iterating over.
#[derive(Debug)]
pub(crate) struct PatternIter<'p> {
patterns: &'p Patterns,
i: usize,
}
impl<'p> Iterator for PatternIter<'p> {
type Item = (PatternID, Pattern<'p>);
fn next(&mut self) -> Option<(PatternID, Pattern<'p>)> {
if self.i >= self.patterns.len() {
return None;
}
let id = self.patterns.order[self.i];
let p = self.patterns.get(id);
self.i += 1;
Some((id, p))
}
}
/// A pattern that is used in packed searching.
#[derive(Clone)]
pub(crate) struct Pattern<'a>(&'a [u8]);
impl<'a> fmt::Debug for Pattern<'a> {
fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
f.debug_struct("Pattern")
.field("lit", &String::from_utf8_lossy(&self.0))
.finish()
}
}
impl<'p> Pattern<'p> {
/// Returns the length of this pattern, in bytes.
pub(crate) fn len(&self) -> usize {
self.0.len()
}
/// Returns the bytes of this pattern.
pub(crate) fn bytes(&self) -> &[u8] {
&self.0
}
/// Returns the first `len` low nybbles from this pattern. If this pattern
/// is shorter than `len`, then this panics.
pub(crate) fn low_nybbles(&self, len: usize) -> Box<[u8]> {
let mut nybs = vec![0; len].into_boxed_slice();
for (i, byte) in self.bytes().iter().take(len).enumerate() {
nybs[i] = byte & 0xF;
}
nybs
}
/// Returns true if this pattern is a prefix of the given bytes.
#[inline(always)]
pub(crate) fn is_prefix(&self, bytes: &[u8]) -> bool {
is_prefix(bytes, self.bytes())
}
/// Returns true if this pattern is a prefix of the haystack given by the
/// raw `start` and `end` pointers.
///
/// # Safety
///
/// * It must be the case that `start < end` and that the distance between
/// them is at least equal to `V::BYTES`. That is, it must always be valid
/// to do at least an unaligned load of `V` at `start`.
/// * Both `start` and `end` must be valid for reads.
/// * Both `start` and `end` must point to an initialized value.
/// * Both `start` and `end` must point to the same allocated object and
/// must either be in bounds or at most one byte past the end of the
/// allocated object.
/// * Both `start` and `end` must be _derived from_ a pointer to the same
/// object.
/// * The distance between `start` and `end` must not overflow `isize`.
/// * The distance being in bounds must not rely on "wrapping around" the
/// address space.
#[inline(always)]
pub(crate) unsafe fn is_prefix_raw(
&self,
start: *const u8,
end: *const u8,
) -> bool {
let patlen = self.bytes().len();
let haylen = end.distance(start);
if patlen > haylen {
return false;
}
// SAFETY: We've checked that the haystack has length at least equal
// to this pattern. All other safety concerns are the responsibility
// of the caller.
is_equal_raw(start, self.bytes().as_ptr(), patlen)
}
}
/// Returns true if and only if `needle` is a prefix of `haystack`.
///
/// This uses a latency optimized variant of `memcmp` internally which *might*
/// make this faster for very short strings.
///
/// # Inlining
///
/// This routine is marked `inline(always)`. If you want to call this function
/// in a way that is not always inlined, you'll need to wrap a call to it in
/// another function that is marked as `inline(never)` or just `inline`.
#[inline(always)]
fn is_prefix(haystack: &[u8], needle: &[u8]) -> bool {
if needle.len() > haystack.len() {
return false;
}
// SAFETY: Our pointers are derived directly from borrowed slices which
// uphold all of our safety guarantees except for length. We account for
// length with the check above.
unsafe { is_equal_raw(haystack.as_ptr(), needle.as_ptr(), needle.len()) }
}
/// Compare corresponding bytes in `x` and `y` for equality.
///
/// That is, this returns true if and only if `x.len() == y.len()` and
/// `x[i] == y[i]` for all `0 <= i < x.len()`.
///
/// Note that this isn't used. We only use it in tests as a convenient way
/// of testing `is_equal_raw`.
///
/// # Inlining
///
/// This routine is marked `inline(always)`. If you want to call this function
/// in a way that is not always inlined, you'll need to wrap a call to it in
/// another function that is marked as `inline(never)` or just `inline`.
///
/// # Motivation
///
/// Why not use slice equality instead? Well, slice equality usually results in
/// a call out to the current platform's `libc` which might not be inlineable
/// or have other overhead. This routine isn't guaranteed to be a win, but it
/// might be in some cases.
#[cfg(test)]
#[inline(always)]
fn is_equal(x: &[u8], y: &[u8]) -> bool {
if x.len() != y.len() {
return false;
}
// SAFETY: Our pointers are derived directly from borrowed slices which
// uphold all of our safety guarantees except for length. We account for
// length with the check above.
unsafe { is_equal_raw(x.as_ptr(), y.as_ptr(), x.len()) }
}
/// Compare `n` bytes at the given pointers for equality.
///
/// This returns true if and only if `*x.add(i) == *y.add(i)` for all
/// `0 <= i < n`.
///
/// # Inlining
///
/// This routine is marked `inline(always)`. If you want to call this function
/// in a way that is not always inlined, you'll need to wrap a call to it in
/// another function that is marked as `inline(never)` or just `inline`.
///
/// # Motivation
///
/// Why not use slice equality instead? Well, slice equality usually results in
/// a call out to the current platform's `libc` which might not be inlineable
/// or have other overhead. This routine isn't guaranteed to be a win, but it
/// might be in some cases.
///
/// # Safety
///
/// * Both `x` and `y` must be valid for reads of up to `n` bytes.
/// * Both `x` and `y` must point to an initialized value.
/// * Both `x` and `y` must each point to an allocated object and
/// must either be in bounds or at most one byte past the end of the
/// allocated object. `x` and `y` do not need to point to the same allocated
/// object, but they may.
/// * Both `x` and `y` must be _derived from_ a pointer to their respective
/// allocated objects.
/// * The distance between `x` and `x+n` must not overflow `isize`. Similarly
/// for `y` and `y+n`.
/// * The distance being in bounds must not rely on "wrapping around" the
/// address space.
#[inline(always)]
unsafe fn is_equal_raw(mut x: *const u8, mut y: *const u8, n: usize) -> bool {
// If we don't have enough bytes to do 4-byte at a time loads, then
// handle each possible length specially. Note that I used to have a
// byte-at-a-time loop here and that turned out to be quite a bit slower
// for the memmem/pathological/defeat-simple-vector-alphabet benchmark.
if n < 4 {
return match n {
0 => true,
1 => x.read() == y.read(),
2 => {
x.cast::<u16>().read_unaligned()
== y.cast::<u16>().read_unaligned()
}
// I also tried copy_nonoverlapping here and it looks like the
// codegen is the same.
3 => x.cast::<[u8; 3]>().read() == y.cast::<[u8; 3]>().read(),
_ => unreachable!(),
};
}
// When we have 4 or more bytes to compare, then proceed in chunks of 4 at
// a time using unaligned loads.
//
// Also, why do 4 byte loads instead of, say, 8 byte loads? The reason is
// that this particular version of memcmp is likely to be called with tiny
// needles. That means that if we do 8 byte loads, then a higher proportion
// of memcmp calls will use the slower variant above. With that said, this
// is a hypothesis and is only loosely supported by benchmarks. There's
// likely some improvement that could be made here. The main thing here
// though is to optimize for latency, not throughput.
// SAFETY: The caller is responsible for ensuring the pointers we get are
// valid and readable for at least `n` bytes. We also do unaligned loads,
// so there's no need to ensure we're aligned. (This is justified by this
// routine being specifically for short strings.)
let xend = x.add(n.wrapping_sub(4));
let yend = y.add(n.wrapping_sub(4));
while x < xend {
let vx = x.cast::<u32>().read_unaligned();
let vy = y.cast::<u32>().read_unaligned();
if vx != vy {
return false;
}
x = x.add(4);
y = y.add(4);
}
let vx = xend.cast::<u32>().read_unaligned();
let vy = yend.cast::<u32>().read_unaligned();
vx == vy
}
#[cfg(test)]
mod tests {
use super::*;
#[test]
fn equals_different_lengths() {
assert!(!is_equal(b"", b"a"));
assert!(!is_equal(b"a", b""));
assert!(!is_equal(b"ab", b"a"));
assert!(!is_equal(b"a", b"ab"));
}
#[test]
fn equals_mismatch() {
let one_mismatch = [
(&b"a"[..], &b"x"[..]),
(&b"ab"[..], &b"ax"[..]),
(&b"abc"[..], &b"abx"[..]),
(&b"abcd"[..], &b"abcx"[..]),
(&b"abcde"[..], &b"abcdx"[..]),
(&b"abcdef"[..], &b"abcdex"[..]),
(&b"abcdefg"[..], &b"abcdefx"[..]),
(&b"abcdefgh"[..], &b"abcdefgx"[..]),
(&b"abcdefghi"[..], &b"abcdefghx"[..]),
(&b"abcdefghij"[..], &b"abcdefghix"[..]),
(&b"abcdefghijk"[..], &b"abcdefghijx"[..]),
(&b"abcdefghijkl"[..], &b"abcdefghijkx"[..]),
(&b"abcdefghijklm"[..], &b"abcdefghijklx"[..]),
(&b"abcdefghijklmn"[..], &b"abcdefghijklmx"[..]),
];
for (x, y) in one_mismatch {
assert_eq!(x.len(), y.len(), "lengths should match");
assert!(!is_equal(x, y));
assert!(!is_equal(y, x));
}
}
#[test]
fn equals_yes() {
assert!(is_equal(b"", b""));
assert!(is_equal(b"a", b"a"));
assert!(is_equal(b"ab", b"ab"));
assert!(is_equal(b"abc", b"abc"));
assert!(is_equal(b"abcd", b"abcd"));
assert!(is_equal(b"abcde", b"abcde"));
assert!(is_equal(b"abcdef", b"abcdef"));
assert!(is_equal(b"abcdefg", b"abcdefg"));
assert!(is_equal(b"abcdefgh", b"abcdefgh"));
assert!(is_equal(b"abcdefghi", b"abcdefghi"));
}
#[test]
fn prefix() {
assert!(is_prefix(b"", b""));
assert!(is_prefix(b"a", b""));
assert!(is_prefix(b"ab", b""));
assert!(is_prefix(b"foo", b"foo"));
assert!(is_prefix(b"foobar", b"foo"));
assert!(!is_prefix(b"foo", b"fob"));
assert!(!is_prefix(b"foobar", b"fob"));
}
}

View File

@ -0,0 +1,168 @@
use alloc::{sync::Arc, vec, vec::Vec};
use crate::{packed::pattern::Patterns, util::search::Match, PatternID};
/// The type of the rolling hash used in the Rabin-Karp algorithm.
type Hash = usize;
/// The number of buckets to store our patterns in. We don't want this to be
/// too big in order to avoid wasting memory, but we don't want it to be too
/// small either to avoid spending too much time confirming literals.
///
/// The number of buckets MUST be a power of two. Otherwise, determining the
/// bucket from a hash will slow down the code considerably. Using a power
/// of two means `hash % NUM_BUCKETS` can compile down to a simple `and`
/// instruction.
const NUM_BUCKETS: usize = 64;
/// An implementation of the Rabin-Karp algorithm. The main idea of this
/// algorithm is to maintain a rolling hash as it moves through the input, and
/// then check whether that hash corresponds to the same hash for any of the
/// patterns we're looking for.
///
/// A drawback of naively scaling Rabin-Karp to multiple patterns is that
/// it requires all of the patterns to be the same length, which in turn
/// corresponds to the number of bytes to hash. We adapt this to work for
/// multiple patterns of varying size by fixing the number of bytes to hash
/// to be the length of the smallest pattern. We also split the patterns into
/// several buckets to hopefully make the confirmation step faster.
///
/// Wikipedia has a decent explanation, if a bit heavy on the theory:
/// https://en.wikipedia.org/wiki/Rabin%E2%80%93Karp_algorithm
///
/// But ESMAJ provides something a bit more concrete:
/// https://www-igm.univ-mlv.fr/~lecroq/string/node5.html
#[derive(Clone, Debug)]
pub(crate) struct RabinKarp {
/// The patterns we're searching for.
patterns: Arc<Patterns>,
/// The order of patterns in each bucket is significant. Namely, they are
/// arranged such that the first one to match is the correct match. This
/// may not necessarily correspond to the order provided by the caller.
/// For example, if leftmost-longest semantics are used, then the patterns
/// are sorted by their length in descending order. If leftmost-first
/// semantics are used, then the patterns are sorted by their pattern ID
/// in ascending order (which corresponds to the caller's order).
buckets: Vec<Vec<(Hash, PatternID)>>,
/// The length of the hashing window. Generally, this corresponds to the
/// length of the smallest pattern.
hash_len: usize,
/// The factor to subtract out of a hash before updating it with a new
/// byte.
hash_2pow: usize,
}
impl RabinKarp {
/// Compile a new Rabin-Karp matcher from the patterns given.
///
/// This panics if any of the patterns in the collection are empty, or if
/// the collection is itself empty.
pub(crate) fn new(patterns: &Arc<Patterns>) -> RabinKarp {
assert!(patterns.len() >= 1);
let hash_len = patterns.minimum_len();
assert!(hash_len >= 1);
let mut hash_2pow = 1usize;
for _ in 1..hash_len {
hash_2pow = hash_2pow.wrapping_shl(1);
}
let mut rk = RabinKarp {
patterns: Arc::clone(patterns),
buckets: vec![vec![]; NUM_BUCKETS],
hash_len,
hash_2pow,
};
for (id, pat) in patterns.iter() {
let hash = rk.hash(&pat.bytes()[..rk.hash_len]);
let bucket = hash % NUM_BUCKETS;
rk.buckets[bucket].push((hash, id));
}
rk
}
/// Return the first matching pattern in the given haystack, beginning the
/// search at `at`.
pub(crate) fn find_at(
&self,
haystack: &[u8],
mut at: usize,
) -> Option<Match> {
assert_eq!(NUM_BUCKETS, self.buckets.len());
if at + self.hash_len > haystack.len() {
return None;
}
let mut hash = self.hash(&haystack[at..at + self.hash_len]);
loop {
let bucket = &self.buckets[hash % NUM_BUCKETS];
for &(phash, pid) in bucket {
if phash == hash {
if let Some(c) = self.verify(pid, haystack, at) {
return Some(c);
}
}
}
if at + self.hash_len >= haystack.len() {
return None;
}
hash = self.update_hash(
hash,
haystack[at],
haystack[at + self.hash_len],
);
at += 1;
}
}
/// Returns the approximate total amount of heap used by this searcher, in
/// units of bytes.
pub(crate) fn memory_usage(&self) -> usize {
self.buckets.len() * core::mem::size_of::<Vec<(Hash, PatternID)>>()
+ self.patterns.len() * core::mem::size_of::<(Hash, PatternID)>()
}
/// Verify whether the pattern with the given id matches at
/// `haystack[at..]`.
///
/// We tag this function as `cold` because it helps improve codegen.
/// Intuitively, it would seem like inlining it would be better. However,
/// the only time this is called and a match is not found is when there
/// there is a hash collision, or when a prefix of a pattern matches but
/// the entire pattern doesn't match. This is hopefully fairly rare, and
/// if it does occur a lot, it's going to be slow no matter what we do.
#[cold]
fn verify(
&self,
id: PatternID,
haystack: &[u8],
at: usize,
) -> Option<Match> {
let pat = self.patterns.get(id);
if pat.is_prefix(&haystack[at..]) {
Some(Match::new(id, at..at + pat.len()))
} else {
None
}
}
/// Hash the given bytes.
fn hash(&self, bytes: &[u8]) -> Hash {
assert_eq!(self.hash_len, bytes.len());
let mut hash = 0usize;
for &b in bytes {
hash = hash.wrapping_shl(1).wrapping_add(b as usize);
}
hash
}
/// Update the hash given based on removing `old_byte` at the beginning
/// of some byte string, and appending `new_byte` to the end of that same
/// byte string.
fn update_hash(&self, prev: Hash, old_byte: u8, new_byte: u8) -> Hash {
prev.wrapping_sub((old_byte as usize).wrapping_mul(self.hash_2pow))
.wrapping_shl(1)
.wrapping_add(new_byte as usize)
}
}
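To make the arithmetic in `hash` and `update_hash` above concrete, here is a
small standalone sketch (not part of the vendored file; all names are local to
the example) that checks the rolling update against rehashing each window from
scratch:

```rust
// Mirrors the arithmetic of `hash` and `update_hash`: a window hash is a
// shift-and-add fold, and rolling it forward subtracts the departing byte's
// weight (2^(window_len - 1)), shifts, and adds the arriving byte.
fn full_hash(window: &[u8]) -> usize {
    window
        .iter()
        .fold(0usize, |h, &b| h.wrapping_shl(1).wrapping_add(b as usize))
}

fn roll(prev: usize, old: u8, new: u8, hash_2pow: usize) -> usize {
    prev.wrapping_sub((old as usize).wrapping_mul(hash_2pow))
        .wrapping_shl(1)
        .wrapping_add(new as usize)
}

fn main() {
    let haystack = b"foofoobar";
    let len = 3; // hashing window = length of the smallest pattern
    let hash_2pow = 1usize << (len - 1); // weight of the byte leaving the window
    let mut h = full_hash(&haystack[..len]);
    for at in 0..haystack.len() - len {
        // Rolling the hash forward must agree with rehashing from scratch.
        h = roll(h, haystack[at], haystack[at + len], hash_2pow);
        assert_eq!(h, full_hash(&haystack[at + 1..at + 1 + len]));
    }
}
```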

View File

@ -0,0 +1,386 @@
Teddy is a SIMD accelerated multiple substring matching algorithm. The name
and the core ideas in the algorithm were learned from the [Hyperscan][1_u]
project. The implementation in this repository was mostly motivated for use in
accelerating regex searches by searching for small sets of required literals
extracted from the regex.
# Background
The key idea of Teddy is to do *packed* substring matching. In the literature,
packed substring matching is the idea of examining multiple bytes in a haystack
at a time to detect matches. Implementations of, for example, memchr (which
detects matches of a single byte) have been doing this for years. Only
recently, with the introduction of various SIMD instructions, has this been
extended to substring matching. The PCMPESTRI instruction (and its relatives),
for example, implements substring matching in hardware. It is, however, limited
to substrings of length 16 bytes or fewer, but this restriction is fine in a
regex engine, since we rarely care about the performance difference between
searching for a 16 byte literal and a 16 + N byte literal; 16 is already long
enough. The key downside of the PCMPESTRI instruction, on current (2016) CPUs
at least, is its latency and throughput. As a result, it is often faster to
do substring search with a Boyer-Moore (or Two-Way) variant and a well placed
memchr to quickly skip through the haystack.
There are fewer results from the literature on packed substring matching,
and even fewer for packed multiple substring matching. Ben-Kiki et al. [2]
describes use of PCMPESTRI for substring matching, but is mostly theoretical
and hand-waves performance. There is other theoretical work done by Bille [3]
as well.
The rest of the work in the field, as far as I'm aware, is by Faro and Kulekci
and is generally focused on multiple pattern search. Their first paper [4a]
introduces the concept of a fingerprint, which is computed for every block of
N bytes in every pattern. The haystack is then scanned N bytes at a time and
a fingerprint is computed in the same way it was computed for blocks in the
patterns. If the fingerprint corresponds to one that was found in a pattern,
then a verification step follows to confirm that one of the substrings with the
corresponding fingerprint actually matches at the current location. Various
implementation tricks are employed to make sure the fingerprint lookup is fast;
typically by truncating the fingerprint. (This may, of course, provoke more
steps in the verification process, so a balance must be struck.)
The main downside of [4a] is that the minimum substring length is 32 bytes,
presumably because of how the algorithm uses certain SIMD instructions. This
essentially makes it useless for general purpose regex matching, where a small
number of short patterns is far more likely.
Faro and Kulekci published another paper [4b] that is conceptually very similar
to [4a]. The key difference is that it uses the CRC32 instruction (introduced
as part of SSE 4.2) to compute fingerprint values. This also enables the
algorithm to work effectively on substrings as short as 7 bytes with 4 byte
windows. 7 bytes is unfortunately still too long. The window could be
technically shrunk to 2 bytes, thereby reducing minimum length to 3, but the
small window size ends up negating most of the performance benefits (and such
short patterns are likely the common case in a general purpose regex engine).
Faro and Kulekci also published [4c] that appears to be intended as a
replacement to using PCMPESTRI. In particular, it is specifically motivated by
the high throughput/latency time of PCMPESTRI and therefore chooses other SIMD
instructions that are faster. While this approach works for short substrings,
I personally couldn't see a way to generalize it to multiple substring search.
Faro and Kulekci have another paper [4d] that I haven't been able to read
because it is behind a paywall.
# Teddy
Finally, we get to Teddy. If the above literature review is complete, then it
appears that Teddy is a novel algorithm. More than that, in my experience, it
completely blows away the competition for short substrings, which is exactly
what we want in a general purpose regex engine. Again, the algorithm appears
to be developed by the authors of [Hyperscan][1_u]. Hyperscan was open sourced
late 2015, and no earlier history could be found. Therefore, tracking the exact
provenance of the algorithm with respect to the published literature seems
difficult.
At a high level, Teddy works somewhat similarly to the fingerprint algorithms
published by Faro and Kulekci, but Teddy does it in a way that scales a bit
better. Namely:
1. Teddy's core algorithm scans the haystack in 16 (for SSE, or 32 for AVX)
byte chunks. 16 (or 32) is significant because it corresponds to the number
of bytes in a SIMD vector.
2. Bitwise operations are performed on each chunk to discover if any region of
it matches a set of precomputed fingerprints from the patterns. If there are
matches, then a verification step is performed. In this implementation, our
verification step is naive. This can be improved upon.
The details to make this work are quite clever. First, we must choose how to
pick our fingerprints. In Hyperscan's implementation, I *believe* they use the
last N bytes of each substring, where N can be at most the minimum length of
any substring in the set being searched. In this implementation, we use the
first N bytes of each substring. (The tradeoffs between these choices aren't
yet clear to me.) We then must figure out how to quickly test whether an
occurrence of any fingerprint from the set of patterns appears in a 16 byte
block from the haystack. To keep things simple, let's assume N = 1 and examine
some examples to motivate the approach. Here are our patterns:
```ignore
foo
bar
baz
```
The corresponding fingerprints, for N = 1, are `f`, `b` and `b`. Now let's set
our 16 byte block to:
```ignore
bat cat foo bump
xxxxxxxxxxxxxxxx
```
To cut to the chase, Teddy works by using bitsets. In particular, Teddy creates
a mask that allows us to quickly compute membership of a fingerprint in a 16
byte block that also tells which pattern the fingerprint corresponds to. In
this case, our fingerprint is a single byte, so an appropriate abstraction is
a map from a single byte to a list of patterns that contain that fingerprint:
```ignore
f |--> foo
b |--> bar, baz
```
Now, all we need to do is figure out how to represent this map in vector space
and use normal SIMD operations to perform a lookup. The first simplification
we can make is to represent our patterns as bit fields occupying a single
byte. This is important, because a single SIMD vector can store 16 bytes.
```ignore
f |--> 00000001
b |--> 00000010, 00000100
```
How do we perform lookup though? It turns out that SSSE3 introduced a very cool
instruction called PSHUFB. The instruction takes two SIMD vectors, `A` and `B`,
and returns a third vector `C`. All vectors are treated as 16 8-bit integers.
`C` is formed by `C[i] = A[B[i]]`. (This is a bit of a simplification, but true
for the purposes of this algorithm. For full details, see [Intel's Intrinsics
Guide][5_u].) This essentially lets us use the values in `B` to lookup values
in `A`.
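For readers who haven't met PSHUFB before, a rough scalar model of the 16 byte
variant used in this discussion looks like the following (the high-bit zeroing
rule is part of the instruction but won't matter below, since indices are
reduced to nybbles first):

```rust
/// Scalar model of PSHUFB: each output byte is a lookup into `a`, indexed by
/// the low nybble of the corresponding byte of `b`, or zero if the high bit
/// of `b[i]` is set.
fn pshufb(a: [u8; 16], b: [u8; 16]) -> [u8; 16] {
    let mut c = [0u8; 16];
    for i in 0..16 {
        if b[i] & 0x80 == 0 {
            c[i] = a[(b[i] & 0x0F) as usize];
        }
    }
    c
}
```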
If we could somehow cause `B` to contain our 16 byte block from the haystack,
and if `A` could contain our bitmasks, then we'd end up with something like
this for `A`:
```ignore
0x00 0x01 ... 0x62 ... 0x66 ... 0xFF
A = 0 0 00000110 00000001 0
```
And if `B` contains our window from our haystack, we could use shuffle to take
the values from `B` and use them to look up our bitsets in `A`. But of course,
we can't do this because `A` in the above example contains 256 bytes, which
is much larger than the size of a SIMD vector.
Nybbles to the rescue! A nybble is 4 bits. Instead of one mask to hold all of
our bitsets, we can use two masks, where one mask corresponds to the lower four
bits of our fingerprint and the other mask corresponds to the upper four bits.
So our map now looks like:
```ignore
'f' & 0xF = 0x6 |--> 00000001
'f' >> 4 = 0x6 |--> 00000111
'b' & 0xF = 0x2 |--> 00000110
'b' >> 4 = 0x6 |--> 00000111
```
Notice that the bitsets for each nybble correspond to the union of all
fingerprints that contain that nybble. For example, both `f` and `b` have the
same upper 4 bits but differ on the lower 4 bits. Putting this together, we
have `A0`, `A1` and `B`, where `A0` is our mask for the lower nybble, `A1` is
our mask for the upper nybble and `B` is our 16 byte block from the haystack:
```ignore
0x00 0x01 0x02 0x03 ... 0x06 ... 0xF
A0 = 0 0 00000110 0 00000001 0
A1 = 0 0 0 0 00000111 0
B = b a t _ t p
B = 0x62 0x61 0x74 0x20 0x74 0x70
```
But of course, we can't use `B` with `PSHUFB` yet, since its values are 8 bits,
and we need indexes that are at most 4 bits (corresponding to one of 16
values). We can apply the same transformation to split `B` into lower and upper
nybbles as we did `A`. As before, `B0` corresponds to the lower nybbles and
`B1` corresponds to the upper nybbles:
```ignore
b a t _ c a t _ f o o _ b u m p
B0 = 0x2 0x1 0x4 0x0 0x3 0x1 0x4 0x0 0x6 0xF 0xF 0x0 0x2 0x5 0xD 0x0
B1 = 0x6 0x6 0x7 0x2 0x6 0x6 0x7 0x2 0x6 0x6 0x6 0x2 0x6 0x7 0x6 0x7
```
And now we have a nice correspondence. `B0` can index `A0` and `B1` can index
`A1`. Here's what we get when we apply `C0 = PSHUFB(A0, B0)`:
```ignore
b a ... f o ... p
A0[0x2] A0[0x1] A0[0x6] A0[0xF] A0[0x0]
C0 = 00000110 0 00000001 0 0
```
And `C1 = PSHUFB(A1, B1)`:
```ignore
b a ... f o ... p
A1[0x6] A1[0x6] A1[0x6] A1[0x6] A1[0x7]
C1 = 00000111 00000111 00000111 00000111 0
```
Notice how neither one of `C0` or `C1` is guaranteed to report fully correct
results all on its own. For example, `C1` claims that `b` is a fingerprint for
the pattern `foo` (since `A1[0x6] = 00000111`), and that `o` is a fingerprint
for all of our patterns. But if we combined `C0` and `C1` with an `AND`
operation:
```ignore
b a ... f o ... p
C = 00000110 0 00000001 0 0
```
Then we now have that `C[i]` contains a bitset corresponding to the matching
fingerprints in a haystack's 16 byte block, where `i` is the `ith` byte in that
block.
Once we have that, we can look for the position of the least significant bit
in `C`. (Least significant because we only target little endian here. Thus,
the least significant bytes correspond to bytes in our haystack at a lower
address.) That position, modulo `8`, gives us the pattern that the fingerprint
matches. That position, integer divided by `8`, also gives us the byte offset
that the fingerprint occurs in inside the 16 byte haystack block. Using those
two pieces of information, we can run a verification procedure that tries
to match all substrings containing that fingerprint at that position in the
haystack.
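Putting those pieces together, here is a purely illustrative scalar
walk-through of the N = 1 example (no SIMD; `pshufb` is the scalar model from
the earlier sketch, and the bucket numbering is just an assumption made for
the example):

```rust
// Scalar model of PSHUFB, repeated here so the example is self-contained.
fn pshufb(a: [u8; 16], b: [u8; 16]) -> [u8; 16] {
    let mut c = [0u8; 16];
    for i in 0..16 {
        if b[i] & 0x80 == 0 {
            c[i] = a[(b[i] & 0x0F) as usize];
        }
    }
    c
}

fn main() {
    // Bucket assignments follow the running example: bit 0 = "foo",
    // bit 1 = "bar", bit 2 = "baz".
    let patterns = ["foo", "bar", "baz"];

    // Build the low/high nybble masks from the first byte of each pattern.
    let (mut a0, mut a1) = ([0u8; 16], [0u8; 16]);
    for (bucket, pat) in patterns.iter().enumerate() {
        let fp = pat.as_bytes()[0];
        a0[(fp & 0x0F) as usize] |= 1u8 << bucket;
        a1[(fp >> 4) as usize] |= 1u8 << bucket;
    }

    // The 16 byte haystack block from the example, split into nybbles.
    let block = *b"bat cat foo bump";
    let b0 = block.map(|byte| byte & 0x0F);
    let b1 = block.map(|byte| byte >> 4);

    // Two shuffles and an AND, exactly as described above.
    let c0 = pshufb(a0, b0);
    let c1 = pshufb(a1, b1);

    // Every non-zero byte of C names a haystack offset and a bitset of
    // buckets whose fingerprint occurs there; these are the candidates that
    // would be handed off to verification. This prints candidates for "bar"
    // and "baz" at offsets 0 and 12, and for "foo" at offset 8.
    for offset in 0..16 {
        let mut bits = c0[offset] & c1[offset];
        while bits != 0 {
            let bucket = bits.trailing_zeros() as usize;
            println!("candidate: {:?} at offset {}", patterns[bucket], offset);
            bits &= bits - 1;
        }
    }
}
```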
# Implementation notes
The problem with the algorithm as described above is that it uses a single byte
for a fingerprint. This will work well if the fingerprints are rare in the
haystack (e.g., capital letters or special characters in normal English text),
but if the fingerprints are common, you'll wind up spending too much time in
the verification step, which effectively negates the performance benefits of
scanning 16 bytes at a time. Remember, the key to the performance of this
algorithm is to do as little work as possible per 16 (or 32) bytes.
This algorithm can be extrapolated in a relatively straight-forward way to use
larger fingerprints. That is, instead of a single byte prefix, we might use a
two or three byte prefix. The implementation here implements N = {1, 2, 3}
and always picks the largest N possible. The rationale is that the bigger the
fingerprint, the fewer verification steps we'll do. Of course, if N is too
large, then we'll end up doing too much on each step.
The way to extend it is:
1. Add a mask for each byte in the fingerprint. (Remember that each mask is
composed of two SIMD vectors.) This results in a value of `C` for each byte
in the fingerprint while searching.
2. When testing each 16 (or 32) byte block, each value of `C` must be shifted
so that they are aligned. Once aligned, they should all be `AND`'d together.
This will give you only the bitsets corresponding to the full match of the
fingerprint. To do this, one needs to save the last byte (for N=2) or last
two bytes (for N=3) from the previous iteration, and then line them up with
the first one or two bytes of the next iteration.
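To make the alignment in step 2 concrete for N = 2, here is a scalar sketch
(purely illustrative; the real implementation does this with vector shifts and
the names are not the crate's): `c0[i]` holds the buckets whose first
fingerprint byte matches haystack byte `i`, and `c1[i]` those whose second
byte matches byte `i`.

```rust
// A full two-byte fingerprint starting at offset i needs c0[i] & c1[i + 1],
// so c0 is used one position behind c1 and the candidate is reported at the
// position of the second byte. The last c0 value of a chunk is carried into
// the next chunk, mirroring the "save the last byte" note above.
fn align_n2(c0: &[u8], c1: &[u8], carried_c0: u8) -> Vec<u8> {
    let mut out = Vec::with_capacity(c1.len());
    for i in 0..c1.len() {
        let first = if i == 0 { carried_c0 } else { c0[i - 1] };
        out.push(first & c1[i]);
    }
    out
}
```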
## Verification
Verification generally follows the procedure outlined above. The tricky parts
are in the right formulation of operations to get our bits out of our vectors.
We have a limited set of operations available to us on SIMD vectors as 128-bit
or 256-bit numbers, so we wind up needing to rip out 2 (or 4) 64-bit integers
from our vectors, and then run our verification step on each of those. The
verification step looks at the least significant bit set, and from its
position, we can derive the byte offset and bucket. (Again, as described
above.) Once we know the bucket, we do a fairly naive exhaustive search for
every literal in that bucket. (Hyperscan is a bit smarter here and uses a hash
table, but I haven't had time to thoroughly explore that. A few initial
half-hearted attempts resulted in worse performance.)
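The bit twiddling in that loop looks roughly like the sketch below. `verify_bucket` is a hypothetical stand-in for the exhaustive check of every literal in the reported bucket at the reported offset; it is not the crate's actual API:
```rust
/// Scan one 64-bit lane of candidate bits, reporting the first candidate that
/// verifies. `lane_start` is the haystack offset of the lane's first byte.
fn scan_lane<F>(
    lane: u64,
    lane_start: usize,
    mut verify_bucket: F,
) -> Option<(usize, usize)>
where
    F: FnMut(usize, usize) -> Option<(usize, usize)>,
{
    let mut bits = lane;
    while bits != 0 {
        let pos = bits.trailing_zeros() as usize;
        let bucket = pos % 8; // which of the 8 buckets this candidate is in
        let offset = lane_start + pos / 8; // haystack offset of the candidate
        if let Some(m) = verify_bucket(bucket, offset) {
            return Some(m);
        }
        bits &= bits - 1; // clear the lowest set bit and keep scanning
    }
    None
}
```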
## AVX
The AVX version of Teddy extrapolates almost perfectly from the SSE version.
The only hiccup is that PALIGNR is used to align chunks in the 128-bit version,
and there is no equivalent instruction in AVX. AVX does have VPALIGNR, but it
only works within 128-bit lanes. So there's a bit of tomfoolery to get around
this by shuffling the vectors before calling VPALIGNR.
The only other aspect to AVX is that since our masks are still fundamentally
16-bytes (0x0-0xF), they are duplicated to 32-bytes, so that they can apply to
32-byte chunks.
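A minimal sketch of that duplication, treating a mask as a plain byte array (the real code builds its 256-bit masks with AVX2 intrinsics):
```rust
/// Duplicate a 16-byte nibble-lookup mask into both 128-bit halves of a
/// 32-byte buffer, so that the same table lookup applies to each half of a
/// 32-byte haystack chunk.
fn duplicate_mask(mask: [u8; 16]) -> [u8; 32] {
    let mut wide = [0u8; 32];
    wide[..16].copy_from_slice(&mask);
    wide[16..].copy_from_slice(&mask);
    wide
}
```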
## Fat Teddy
In the version of Teddy described above, 8 buckets are used to group patterns
that we want to search for. However, when AVX is available, we can extend the
number of buckets to 16 by permitting each byte in our masks to use 16-bits
instead of 8-bits to represent the buckets it belongs to. (This variant is also
in Hyperscan.) However, what we give up is the ability to scan 32 bytes at a
time, even though we're using AVX. Instead, we have to scan 16 bytes at a time.
What we gain, though, is (hopefully) less work in our verification routine.
If patterns are spread out across more buckets, then there should overall
be fewer false positives. In general, Fat Teddy permits us to grow our capacity
a bit and search for more literals before Teddy gets overwhelmed.
The tricky part of Fat Teddy is in how we adjust our masks and our verification
procedure. For the masks, we simply represent the first 8 buckets in each of
the low 16 bytes, and then the second 8 buckets in each of the high 16 bytes.
Then, in the search loop, instead of loading 32 bytes from the haystack, we
load the same 16 bytes from the haystack into both the low and high 16 byte
portions of our 256-bit vector. So for example, a mask might look like this:
```ignore
bits:     00100001  00000000  ...  11000000  00000000  00000001  ...  00000000
byte:           31        30  ...        16        15        14  ...         0
offset:         15        14  ...         0        15        14  ...         0
buckets:      8-15      8-15  ...      8-15       0-7       0-7  ...       0-7
```
Where `byte` is the position in the vector (higher numbers corresponding to
more significant bits), `offset` is the corresponding position in the haystack
chunk, and `buckets` corresponds to the bucket assignments for that particular
byte.
In particular, notice that the bucket assignments for offset `0` are spread
out between bytes `0` and `16`. This works well for the chunk-by-chunk search
procedure, but verification really wants to process all bucket assignments for
each offset at once. Otherwise, we might wind up finding a match at offset
`1` in one of the first 8 buckets, when we really should have reported a match
at offset `0` in one of the second 8 buckets. (Because we want the leftmost
match.)
Thus, for verification, we rearrange the above vector such that it is a
sequence of 16-bit integers, where the least significant 16-bit integer
corresponds to all of the bucket assignments for offset `0`. So with the
above vector, the least significant 16-bit integer would be
```ignore
11000000 00000000
```
which was taken from bytes `16` and `0`. Then the verification step pretty much
runs as described, except with 16 buckets instead of 8.
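A scalar sketch of that rearrangement, under the byte layout shown above (illustrative only; the real code does this with vector interleaving):
```rust
/// Rearrange a Fat Teddy candidate vector into one `u16` of bucket bits per
/// haystack offset. Byte `i` of the low half holds buckets 0-7 for offset
/// `i`, while byte `i` of the high half holds buckets 8-15 for the same
/// offset.
fn interleave_fat(candidates: &[u8; 32]) -> [u16; 16] {
    let mut per_offset = [0u16; 16];
    for i in 0..16 {
        let lo = u16::from(candidates[i]); // buckets 0-7 for offset i
        let hi = u16::from(candidates[i + 16]); // buckets 8-15 for offset i
        per_offset[i] = (hi << 8) | lo;
    }
    per_offset
}
```
With the example vector above, `per_offset[0]` works out to `11000000 00000000`, matching the 16-bit integer shown earlier.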
# References
- **[1]** [Hyperscan on GitHub](https://github.com/intel/hyperscan),
[webpage](https://www.hyperscan.io/)
- **[2a]** Ben-Kiki, O., Bille, P., Breslauer, D., Gasieniec, L., Grossi, R.,
& Weimann, O. (2011).
_Optimal packed string matching_.
In LIPIcs-Leibniz International Proceedings in Informatics (Vol. 13).
Schloss Dagstuhl-Leibniz-Zentrum fuer Informatik.
DOI: 10.4230/LIPIcs.FSTTCS.2011.423.
[PDF](https://drops.dagstuhl.de/opus/volltexte/2011/3355/pdf/37.pdf).
- **[2b]** Ben-Kiki, O., Bille, P., Breslauer, D., Ga̧sieniec, L., Grossi, R.,
& Weimann, O. (2014).
_Towards optimal packed string matching_.
Theoretical Computer Science, 525, 111-129.
DOI: 10.1016/j.tcs.2013.06.013.
[PDF](https://www.cs.haifa.ac.il/~oren/Publications/bpsm.pdf).
- **[3]** Bille, P. (2011).
_Fast searching in packed strings_.
Journal of Discrete Algorithms, 9(1), 49-56.
DOI: 10.1016/j.jda.2010.09.003.
[PDF](https://www.sciencedirect.com/science/article/pii/S1570866710000353).
- **[4a]** Faro, S., & Külekci, M. O. (2012, October).
_Fast multiple string matching using streaming SIMD extensions technology_.
In String Processing and Information Retrieval (pp. 217-228).
Springer Berlin Heidelberg.
DOI: 10.1007/978-3-642-34109-0_23.
[PDF](https://www.dmi.unict.it/faro/papers/conference/faro32.pdf).
- **[4b]** Faro, S., & Külekci, M. O. (2013, September).
_Towards a Very Fast Multiple String Matching Algorithm for Short Patterns_.
In Stringology (pp. 78-91).
[PDF](https://www.dmi.unict.it/faro/papers/conference/faro36.pdf).
- **[4c]** Faro, S., & Külekci, M. O. (2013, January).
_Fast packed string matching for short patterns_.
In Proceedings of the Meeting on Algorithm Engineering & Experiments
(pp. 113-121).
Society for Industrial and Applied Mathematics.
[PDF](https://arxiv.org/pdf/1209.6449.pdf).
- **[4d]** Faro, S., & Külekci, M. O. (2014).
_Fast and flexible packed string matching_.
Journal of Discrete Algorithms, 28, 61-72.
DOI: 10.1016/j.jda.2014.07.003.
[1_u]: https://github.com/intel/hyperscan
[5_u]: https://software.intel.com/sites/landingpage/IntrinsicsGuide

View File

@ -0,0 +1,780 @@
use core::{
fmt::Debug,
panic::{RefUnwindSafe, UnwindSafe},
};
use alloc::sync::Arc;
use crate::packed::{ext::Pointer, pattern::Patterns, teddy::generic::Match};
/// A builder for constructing a Teddy matcher.
///
/// The builder primarily permits fine grained configuration of the Teddy
/// matcher. Most options are made only available for testing/benchmarking
/// purposes. In reality, options are automatically determined by the nature
/// and number of patterns given to the builder.
#[derive(Clone, Debug)]
pub(crate) struct Builder {
/// When none, this is automatically determined. Otherwise, `false` means
/// slim Teddy is used (8 buckets) and `true` means fat Teddy is used
/// (16 buckets). Fat Teddy requires AVX2, so if that CPU feature isn't
/// available and Fat Teddy was requested, no matcher will be built.
only_fat: Option<bool>,
/// When none, this is automatically determined. Otherwise, `false` means
/// that 128-bit vectors will be used (up to SSSE3 instructions) whereas
/// `true` means that 256-bit vectors will be used. As with `fat`, if
/// 256-bit vectors are requested and they aren't available, then a
/// searcher will not be built.
only_256bit: Option<bool>,
/// When true (the default), the number of patterns will be used as a
/// heuristic for refusing construction of a Teddy searcher. The point here
/// is that too many patterns can overwhelm Teddy. But this can be disabled
/// in cases where the caller knows better.
heuristic_pattern_limits: bool,
}
impl Default for Builder {
fn default() -> Builder {
Builder::new()
}
}
impl Builder {
/// Create a new builder for configuring a Teddy matcher.
pub(crate) fn new() -> Builder {
Builder {
only_fat: None,
only_256bit: None,
heuristic_pattern_limits: true,
}
}
/// Build a matcher for the set of patterns given. If a matcher could not
/// be built, then `None` is returned.
///
/// Generally, a matcher isn't built if the necessary CPU features aren't
/// available, the target is unsupported, or if the searcher is believed to be
/// slower than standard techniques (i.e., if there are too many literals).
pub(crate) fn build(&self, patterns: Arc<Patterns>) -> Option<Searcher> {
self.build_imp(patterns)
}
/// Require the use of Fat (true) or Slim (false) Teddy. Fat Teddy uses
/// 16 buckets whereas Slim Teddy uses 8 buckets. More buckets are useful
/// for a larger set of literals.
///
/// `None` is the default, which results in an automatic selection based
/// on the number of literals and available CPU features.
pub(crate) fn only_fat(&mut self, yes: Option<bool>) -> &mut Builder {
self.only_fat = yes;
self
}
/// Request the use of 256-bit vectors (true) or 128-bit vectors (false).
/// Generally, a larger vector size is better since it either permits
/// matching more patterns or matching more bytes in the haystack at once.
///
/// `None` is the default, which results in an automatic selection based on
/// the number of literals and available CPU features.
pub(crate) fn only_256bit(&mut self, yes: Option<bool>) -> &mut Builder {
self.only_256bit = yes;
self
}
/// Request that heuristic limitations on the number of patterns be
/// employed. This is useful to disable for benchmarking, where one wants to
/// explore how Teddy performs on a large number of patterns even if the
/// heuristics would otherwise refuse construction.
///
/// This is enabled by default.
pub(crate) fn heuristic_pattern_limits(
&mut self,
yes: bool,
) -> &mut Builder {
self.heuristic_pattern_limits = yes;
self
}
fn build_imp(&self, patterns: Arc<Patterns>) -> Option<Searcher> {
let patlimit = self.heuristic_pattern_limits;
// There's no particular reason why we limit ourselves to little endian
// here, but it seems likely that some parts of Teddy as they are
// currently written (e.g., the uses of `trailing_zeros`) are likely
// wrong on non-little-endian targets. Such things are likely easy to
// fix, but at the time of writing (2023/09/18), I actually do not know
// how to test this code on a big-endian target. So for now, we're
// conservative and just bail out.
if !cfg!(target_endian = "little") {
debug!("skipping Teddy because target isn't little endian");
return None;
}
// Too many patterns will overwhelm Teddy and likely lead to slow
// downs, typically in the verification step.
if patlimit && patterns.len() > 64 {
debug!("skipping Teddy because of too many patterns");
return None;
}
#[cfg(all(target_arch = "x86_64", target_feature = "sse2"))]
{
use self::x86_64::{FatAVX2, SlimAVX2, SlimSSSE3};
let mask_len = core::cmp::min(4, patterns.minimum_len());
let beefy = patterns.len() > 32;
let has_avx2 = self::x86_64::is_available_avx2();
let has_ssse3 = has_avx2 || self::x86_64::is_available_ssse3();
let use_avx2 = if self.only_256bit == Some(true) {
if !has_avx2 {
debug!(
"skipping Teddy because avx2 was demanded but unavailable"
);
return None;
}
true
} else if self.only_256bit == Some(false) {
if !has_ssse3 {
debug!(
"skipping Teddy because ssse3 was demanded but unavailable"
);
return None;
}
false
} else if !has_ssse3 && !has_avx2 {
debug!(
"skipping Teddy because ssse3 and avx2 are unavailable"
);
return None;
} else {
has_avx2
};
let fat = match self.only_fat {
None => use_avx2 && beefy,
Some(false) => false,
Some(true) if !use_avx2 => {
debug!(
"skipping Teddy because fat was demanded, but fat \
Teddy requires avx2 which is unavailable"
);
return None;
}
Some(true) => true,
};
// Just like for aarch64, it's possible that too many patterns will
// overwhelm Teddy. Unlike aarch64 though, we have Fat Teddy which
// helps things scale a bit more by spreading patterns over more
// buckets.
//
// These thresholds were determined by looking at the measurements
// for the rust/aho-corasick/packed/leftmost-first and
// rust/aho-corasick/dfa/leftmost-first engines on the `teddy/`
// benchmarks.
if patlimit && mask_len == 1 && patterns.len() > 16 {
debug!(
"skipping Teddy (mask len: 1) because there are \
too many patterns",
);
return None;
}
match (mask_len, use_avx2, fat) {
(1, false, _) => {
debug!("Teddy choice: 128-bit slim, 1 byte");
SlimSSSE3::<1>::new(&patterns)
}
(1, true, false) => {
debug!("Teddy choice: 256-bit slim, 1 byte");
SlimAVX2::<1>::new(&patterns)
}
(1, true, true) => {
debug!("Teddy choice: 256-bit fat, 1 byte");
FatAVX2::<1>::new(&patterns)
}
(2, false, _) => {
debug!("Teddy choice: 128-bit slim, 2 bytes");
SlimSSSE3::<2>::new(&patterns)
}
(2, true, false) => {
debug!("Teddy choice: 256-bit slim, 2 bytes");
SlimAVX2::<2>::new(&patterns)
}
(2, true, true) => {
debug!("Teddy choice: 256-bit fat, 2 bytes");
FatAVX2::<2>::new(&patterns)
}
(3, false, _) => {
debug!("Teddy choice: 128-bit slim, 3 bytes");
SlimSSSE3::<3>::new(&patterns)
}
(3, true, false) => {
debug!("Teddy choice: 256-bit slim, 3 bytes");
SlimAVX2::<3>::new(&patterns)
}
(3, true, true) => {
debug!("Teddy choice: 256-bit fat, 3 bytes");
FatAVX2::<3>::new(&patterns)
}
(4, false, _) => {
debug!("Teddy choice: 128-bit slim, 4 bytes");
SlimSSSE3::<4>::new(&patterns)
}
(4, true, false) => {
debug!("Teddy choice: 256-bit slim, 4 bytes");
SlimAVX2::<4>::new(&patterns)
}
(4, true, true) => {
debug!("Teddy choice: 256-bit fat, 4 bytes");
FatAVX2::<4>::new(&patterns)
}
_ => {
debug!("no supported Teddy configuration found");
None
}
}
}
#[cfg(target_arch = "aarch64")]
{
use self::aarch64::SlimNeon;
let mask_len = core::cmp::min(4, patterns.minimum_len());
if self.only_256bit == Some(true) {
debug!(
"skipping Teddy because 256-bits were demanded \
but unavailable"
);
return None;
}
if self.only_fat == Some(true) {
debug!(
"skipping Teddy because fat was demanded but unavailable"
);
return None;
}
// Since we don't have Fat teddy in aarch64 (I think we'd want at
// least 256-bit vectors for that), we need to be careful not to
// allow too many patterns as it might overwhelm Teddy. Generally
// speaking, as the mask length goes up, the more patterns we can
// handle because the mask length results in fewer candidates
// generated.
//
// These thresholds were determined by looking at the measurements
// for the rust/aho-corasick/packed/leftmost-first and
// rust/aho-corasick/dfa/leftmost-first engines on the `teddy/`
// benchmarks.
match mask_len {
1 => {
if patlimit && patterns.len() > 16 {
debug!(
"skipping Teddy (mask len: 1) because there are \
too many patterns",
);
return None;
}
debug!("Teddy choice: 128-bit slim, 1 byte");
SlimNeon::<1>::new(&patterns)
}
2 => {
if patlimit && patterns.len() > 32 {
debug!(
"skipping Teddy (mask len: 2) because there are \
too many patterns",
);
return None;
}
debug!("Teddy choice: 128-bit slim, 2 bytes");
SlimNeon::<2>::new(&patterns)
}
3 => {
if patlimit && patterns.len() > 48 {
debug!(
"skipping Teddy (mask len: 3) because there are \
too many patterns",
);
return None;
}
debug!("Teddy choice: 128-bit slim, 3 bytes");
SlimNeon::<3>::new(&patterns)
}
4 => {
debug!("Teddy choice: 128-bit slim, 4 bytes");
SlimNeon::<4>::new(&patterns)
}
_ => {
debug!("no supported Teddy configuration found");
None
}
}
}
#[cfg(not(any(
all(target_arch = "x86_64", target_feature = "sse2"),
target_arch = "aarch64"
)))]
{
None
}
}
}
/// A searcher that dispatches to one of several possible Teddy variants.
#[derive(Clone, Debug)]
pub(crate) struct Searcher {
/// The Teddy variant we use. We use dynamic dispatch under the theory that
/// it results in better codegen then a enum, although this is a specious
/// claim.
///
/// This `Searcher` is essentially a wrapper for a `SearcherT` trait
/// object. We just make `memory_usage` and `minimum_len` available without
/// going through dynamic dispatch.
imp: Arc<dyn SearcherT>,
/// Total heap memory used by the Teddy variant.
memory_usage: usize,
/// The minimum haystack length this searcher can handle. It is intended
/// for callers to use some other search routine (such as Rabin-Karp) in
/// cases where the haystack (or remainder of the haystack) is too short.
minimum_len: usize,
}
impl Searcher {
/// Look for the leftmost occurrence of any pattern in this search in the
/// given haystack starting at the given position.
///
/// # Panics
///
/// This panics when `haystack[at..].len()` is less than the minimum length
/// for this searcher.
#[inline(always)]
pub(crate) fn find(
&self,
haystack: &[u8],
at: usize,
) -> Option<crate::Match> {
// SAFETY: The Teddy implementations all require a minimum haystack
// length, and this is required for safety. Therefore, we assert it
// here in order to make this method sound.
assert!(haystack[at..].len() >= self.minimum_len);
let hayptr = haystack.as_ptr();
// SAFETY: Construction of the searcher guarantees that we are able
// to run it in the current environment (i.e., we won't get an AVX2
// searcher on a x86-64 CPU without AVX2 support). Also, the pointers
// are valid as they are derived directly from a borrowed slice.
let teddym = unsafe {
self.imp.find(hayptr.add(at), hayptr.add(haystack.len()))?
};
let start = teddym.start().as_usize().wrapping_sub(hayptr.as_usize());
let end = teddym.end().as_usize().wrapping_sub(hayptr.as_usize());
let span = crate::Span { start, end };
// OK because we won't permit the construction of a searcher that
// could report a pattern ID bigger than what can fit in the crate-wide
// PatternID type.
let pid = crate::PatternID::new_unchecked(teddym.pattern().as_usize());
let m = crate::Match::new(pid, span);
Some(m)
}
/// Returns the approximate total amount of heap used by this type, in
/// units of bytes.
#[inline(always)]
pub(crate) fn memory_usage(&self) -> usize {
self.memory_usage
}
/// Returns the minimum length, in bytes, that a haystack must be in order
/// to use it with this searcher.
#[inline(always)]
pub(crate) fn minimum_len(&self) -> usize {
self.minimum_len
}
}
/// A trait that provides dynamic dispatch over the different possible Teddy
/// variants on the same algorithm.
///
/// On `x86_64` for example, it isn't known until runtime which of 12 possible
/// variants will be used. One might use one of the four slim 128-bit vector
/// variants, one of the four slim 256-bit vector variants, or even one of the
/// four fat 256-bit vector variants.
///
/// Since this choice is generally made when the Teddy searcher is constructed
/// and this choice is based on the patterns given and what the current CPU
/// supports, it follows that there must be some kind of indirection at search
/// time that "selects" the variant chosen at build time.
///
/// There are a few different ways to go about this. One approach is to use an
/// enum. It works fine, but in my experiments, this generally results in worse
/// codegen. Another approach, which is what we use here, is dynamic dispatch
/// via a trait object. We basically implement this trait for each possible
/// variant, select the variant we want at build time and convert it to a
/// trait object for use at search time.
///
/// Another approach is to use function pointers and stick each of the possible
/// variants into a union. This is essentially isomorphic to the dynamic
/// dispatch approach, but doesn't require any allocations. Since this crate
/// requires `alloc`, there's no real reason (AFAIK) to go down this path. (The
/// `memchr` crate does this.)
trait SearcherT:
Debug + Send + Sync + UnwindSafe + RefUnwindSafe + 'static
{
/// Execute a search on the given haystack (identified by `start` and `end`
/// raw pointers).
///
/// # Safety
///
/// Essentially, the `start` and `end` pointers must be valid and point
/// to a haystack one can read. As long as you derive them from, for
/// example, a `&[u8]`, they should automatically satisfy all of the safety
/// obligations:
///
/// * Both `start` and `end` must be valid for reads.
/// * Both `start` and `end` must point to an initialized value.
/// * Both `start` and `end` must point to the same allocated object and
/// must either be in bounds or at most one byte past the end of the
/// allocated object.
/// * Both `start` and `end` must be _derived from_ a pointer to the same
/// object.
/// * The distance between `start` and `end` must not overflow `isize`.
/// * The distance being in bounds must not rely on "wrapping around" the
/// address space.
/// * It must be the case that `start <= end`.
/// * `end - start` must be greater than the minimum length for this
/// searcher.
///
/// Also, it is expected that implementations of this trait will tag this
/// method with a `target_feature` attribute. Callers must ensure that
/// they are executing this method in an environment where that attribute
/// is valid.
unsafe fn find(&self, start: *const u8, end: *const u8) -> Option<Match>;
}
#[cfg(all(target_arch = "x86_64", target_feature = "sse2"))]
mod x86_64 {
use core::arch::x86_64::{__m128i, __m256i};
use alloc::sync::Arc;
use crate::packed::{
ext::Pointer,
pattern::Patterns,
teddy::generic::{self, Match},
};
use super::{Searcher, SearcherT};
#[derive(Clone, Debug)]
pub(super) struct SlimSSSE3<const BYTES: usize> {
slim128: generic::Slim<__m128i, BYTES>,
}
// Defines SlimSSSE3 wrapper functions for 1, 2, 3 and 4 bytes.
macro_rules! slim_ssse3 {
($len:expr) => {
impl SlimSSSE3<$len> {
/// Creates a new searcher using "slim" Teddy with 128-bit
/// vectors. If SSSE3 is not available in the current
/// environment, then this returns `None`.
pub(super) fn new(
patterns: &Arc<Patterns>,
) -> Option<Searcher> {
if !is_available_ssse3() {
return None;
}
Some(unsafe { SlimSSSE3::<$len>::new_unchecked(patterns) })
}
/// Creates a new searcher using "slim" Teddy with 128-bit
/// vectors without checking whether SSSE3 is available or not.
///
/// # Safety
///
/// Callers must ensure that SSSE3 is available in the current
/// environment.
#[target_feature(enable = "ssse3")]
unsafe fn new_unchecked(patterns: &Arc<Patterns>) -> Searcher {
let slim128 = generic::Slim::<__m128i, $len>::new(
Arc::clone(patterns),
);
let memory_usage = slim128.memory_usage();
let minimum_len = slim128.minimum_len();
let imp = Arc::new(SlimSSSE3 { slim128 });
Searcher { imp, memory_usage, minimum_len }
}
}
impl SearcherT for SlimSSSE3<$len> {
#[target_feature(enable = "ssse3")]
#[inline]
unsafe fn find(
&self,
start: *const u8,
end: *const u8,
) -> Option<Match> {
// SAFETY: All obligations except for `target_feature` are
// passed to the caller. Our use of `target_feature` is
// safe because construction of this type requires that the
// requisite target features are available.
self.slim128.find(start, end)
}
}
};
}
slim_ssse3!(1);
slim_ssse3!(2);
slim_ssse3!(3);
slim_ssse3!(4);
#[derive(Clone, Debug)]
pub(super) struct SlimAVX2<const BYTES: usize> {
slim128: generic::Slim<__m128i, BYTES>,
slim256: generic::Slim<__m256i, BYTES>,
}
// Defines SlimAVX2 wrapper functions for 1, 2, 3 and 4 bytes.
macro_rules! slim_avx2 {
($len:expr) => {
impl SlimAVX2<$len> {
/// Creates a new searcher using "slim" Teddy with 256-bit
/// vectors. If AVX2 is not available in the current
/// environment, then this returns `None`.
pub(super) fn new(
patterns: &Arc<Patterns>,
) -> Option<Searcher> {
if !is_available_avx2() {
return None;
}
Some(unsafe { SlimAVX2::<$len>::new_unchecked(patterns) })
}
/// Creates a new searcher using "slim" Teddy with 256-bit
/// vectors without checking whether AVX2 is available or not.
///
/// # Safety
///
/// Callers must ensure that AVX2 is available in the current
/// environment.
#[target_feature(enable = "avx2")]
unsafe fn new_unchecked(patterns: &Arc<Patterns>) -> Searcher {
let slim128 = generic::Slim::<__m128i, $len>::new(
Arc::clone(&patterns),
);
let slim256 = generic::Slim::<__m256i, $len>::new(
Arc::clone(&patterns),
);
let memory_usage =
slim128.memory_usage() + slim256.memory_usage();
let minimum_len = slim128.minimum_len();
let imp = Arc::new(SlimAVX2 { slim128, slim256 });
Searcher { imp, memory_usage, minimum_len }
}
}
impl SearcherT for SlimAVX2<$len> {
#[target_feature(enable = "avx2")]
#[inline]
unsafe fn find(
&self,
start: *const u8,
end: *const u8,
) -> Option<Match> {
// SAFETY: All obligations except for `target_feature` are
// passed to the caller. Our use of `target_feature` is
// safe because construction of this type requires that the
// requisite target features are available.
let len = end.distance(start);
if len < self.slim256.minimum_len() {
self.slim128.find(start, end)
} else {
self.slim256.find(start, end)
}
}
}
};
}
slim_avx2!(1);
slim_avx2!(2);
slim_avx2!(3);
slim_avx2!(4);
#[derive(Clone, Debug)]
pub(super) struct FatAVX2<const BYTES: usize> {
fat256: generic::Fat<__m256i, BYTES>,
}
// Defines FatAVX2 wrapper functions for 1, 2, 3 and 4 bytes.
macro_rules! fat_avx2 {
($len:expr) => {
impl FatAVX2<$len> {
/// Creates a new searcher using "fat" Teddy with 256-bit
/// vectors. If AVX2 is not available in the current
/// environment, then this returns `None`.
pub(super) fn new(
patterns: &Arc<Patterns>,
) -> Option<Searcher> {
if !is_available_avx2() {
return None;
}
Some(unsafe { FatAVX2::<$len>::new_unchecked(patterns) })
}
/// Creates a new searcher using "fat" Teddy with 256-bit
/// vectors without checking whether AVX2 is available or not.
///
/// # Safety
///
/// Callers must ensure that AVX2 is available in the current
/// environment.
#[target_feature(enable = "avx2")]
unsafe fn new_unchecked(patterns: &Arc<Patterns>) -> Searcher {
let fat256 = generic::Fat::<__m256i, $len>::new(
Arc::clone(&patterns),
);
let memory_usage = fat256.memory_usage();
let minimum_len = fat256.minimum_len();
let imp = Arc::new(FatAVX2 { fat256 });
Searcher { imp, memory_usage, minimum_len }
}
}
impl SearcherT for FatAVX2<$len> {
#[target_feature(enable = "avx2")]
#[inline]
unsafe fn find(
&self,
start: *const u8,
end: *const u8,
) -> Option<Match> {
// SAFETY: All obligations except for `target_feature` are
// passed to the caller. Our use of `target_feature` is
// safe because construction of this type requires that the
// requisite target features are available.
self.fat256.find(start, end)
}
}
};
}
fat_avx2!(1);
fat_avx2!(2);
fat_avx2!(3);
fat_avx2!(4);
#[inline]
pub(super) fn is_available_ssse3() -> bool {
#[cfg(not(target_feature = "sse2"))]
{
false
}
#[cfg(target_feature = "sse2")]
{
#[cfg(target_feature = "ssse3")]
{
true
}
#[cfg(not(target_feature = "ssse3"))]
{
#[cfg(feature = "std")]
{
std::is_x86_feature_detected!("ssse3")
}
#[cfg(not(feature = "std"))]
{
false
}
}
}
}
#[inline]
pub(super) fn is_available_avx2() -> bool {
#[cfg(not(target_feature = "sse2"))]
{
false
}
#[cfg(target_feature = "sse2")]
{
#[cfg(target_feature = "avx2")]
{
true
}
#[cfg(not(target_feature = "avx2"))]
{
#[cfg(feature = "std")]
{
std::is_x86_feature_detected!("avx2")
}
#[cfg(not(feature = "std"))]
{
false
}
}
}
}
}
#[cfg(target_arch = "aarch64")]
mod aarch64 {
use core::arch::aarch64::uint8x16_t;
use alloc::sync::Arc;
use crate::packed::{
pattern::Patterns,
teddy::generic::{self, Match},
};
use super::{Searcher, SearcherT};
#[derive(Clone, Debug)]
pub(super) struct SlimNeon<const BYTES: usize> {
slim128: generic::Slim<uint8x16_t, BYTES>,
}
// Defines SlimNeon wrapper functions for 1, 2, 3 and 4 bytes.
macro_rules! slim_neon {
($len:expr) => {
impl SlimNeon<$len> {
/// Creates a new searcher using "slim" Teddy with 128-bit
/// vectors. If NEON is not available in the current
/// environment, then this returns `None`.
pub(super) fn new(
patterns: &Arc<Patterns>,
) -> Option<Searcher> {
Some(unsafe { SlimNeon::<$len>::new_unchecked(patterns) })
}
/// Creates a new searcher using "slim" Teddy with 128-bit
/// vectors without checking whether NEON is available or not.
///
/// # Safety
///
/// Callers must ensure that NEON is available in the current
/// environment.
#[target_feature(enable = "neon")]
unsafe fn new_unchecked(patterns: &Arc<Patterns>) -> Searcher {
let slim128 = generic::Slim::<uint8x16_t, $len>::new(
Arc::clone(patterns),
);
let memory_usage = slim128.memory_usage();
let minimum_len = slim128.minimum_len();
let imp = Arc::new(SlimNeon { slim128 });
Searcher { imp, memory_usage, minimum_len }
}
}
impl SearcherT for SlimNeon<$len> {
#[target_feature(enable = "neon")]
#[inline]
unsafe fn find(
&self,
start: *const u8,
end: *const u8,
) -> Option<Match> {
// SAFETY: All obligations except for `target_feature` are
// passed to the caller. Our use of `target_feature` is
// safe because construction of this type requires that the
// requisite target features are available.
self.slim128.find(start, end)
}
}
};
}
slim_neon!(1);
slim_neon!(2);
slim_neon!(3);
slim_neon!(4);
}

File diff suppressed because it is too large Load Diff

View File

@ -0,0 +1,9 @@
// Regrettably, Teddy stuff just isn't used on all targets. And for some
// targets, like aarch64, only "slim" Teddy is used and so "fat" Teddy gets a
// bunch of dead-code warnings. Just not worth trying to squash them. Blech.
#![allow(dead_code)]
pub(crate) use self::builder::{Builder, Searcher};
mod builder;
mod generic;

583
vendor/aho-corasick/src/packed/tests.rs vendored Normal file
View File

@ -0,0 +1,583 @@
use std::collections::HashMap;
use alloc::{
format,
string::{String, ToString},
vec,
vec::Vec,
};
use crate::{
packed::{Config, MatchKind},
util::search::Match,
};
/// A description of a single test against a multi-pattern searcher.
///
/// A single test may not necessarily pass on every configuration of a
/// searcher. The tests are categorized and grouped appropriately below.
#[derive(Clone, Debug, Eq, PartialEq)]
struct SearchTest {
/// The name of this test, for debugging.
name: &'static str,
/// The patterns to search for.
patterns: &'static [&'static str],
/// The text to search.
haystack: &'static str,
/// Each match is a triple of (pattern_index, start, end), where
/// pattern_index is an index into `patterns` and `start`/`end` are indices
/// into `haystack`.
matches: &'static [(usize, usize, usize)],
}
struct SearchTestOwned {
offset: usize,
name: String,
patterns: Vec<String>,
haystack: String,
matches: Vec<(usize, usize, usize)>,
}
impl SearchTest {
fn variations(&self) -> Vec<SearchTestOwned> {
let count = if cfg!(miri) { 1 } else { 261 };
let mut tests = vec![];
for i in 0..count {
tests.push(self.offset_prefix(i));
tests.push(self.offset_suffix(i));
tests.push(self.offset_both(i));
}
tests
}
fn offset_both(&self, off: usize) -> SearchTestOwned {
SearchTestOwned {
offset: off,
name: self.name.to_string(),
patterns: self.patterns.iter().map(|s| s.to_string()).collect(),
haystack: format!(
"{}{}{}",
"Z".repeat(off),
self.haystack,
"Z".repeat(off)
),
matches: self
.matches
.iter()
.map(|&(id, s, e)| (id, s + off, e + off))
.collect(),
}
}
fn offset_prefix(&self, off: usize) -> SearchTestOwned {
SearchTestOwned {
offset: off,
name: self.name.to_string(),
patterns: self.patterns.iter().map(|s| s.to_string()).collect(),
haystack: format!("{}{}", "Z".repeat(off), self.haystack),
matches: self
.matches
.iter()
.map(|&(id, s, e)| (id, s + off, e + off))
.collect(),
}
}
fn offset_suffix(&self, off: usize) -> SearchTestOwned {
SearchTestOwned {
offset: off,
name: self.name.to_string(),
patterns: self.patterns.iter().map(|s| s.to_string()).collect(),
haystack: format!("{}{}", self.haystack, "Z".repeat(off)),
matches: self.matches.to_vec(),
}
}
}
/// Short-hand constructor for SearchTest. We use it a lot below.
macro_rules! t {
($name:ident, $patterns:expr, $haystack:expr, $matches:expr) => {
SearchTest {
name: stringify!($name),
patterns: $patterns,
haystack: $haystack,
matches: $matches,
}
};
}
/// A collection of test groups.
type TestCollection = &'static [&'static [SearchTest]];
// Define several collections corresponding to the different type of match
// semantics supported. These collections have some overlap, but each
// collection should have some tests that no other collection has.
/// Tests for leftmost-first match semantics.
const PACKED_LEFTMOST_FIRST: TestCollection =
&[BASICS, LEFTMOST, LEFTMOST_FIRST, REGRESSION, TEDDY];
/// Tests for leftmost-longest match semantics.
const PACKED_LEFTMOST_LONGEST: TestCollection =
&[BASICS, LEFTMOST, LEFTMOST_LONGEST, REGRESSION, TEDDY];
// Now define the individual tests that make up the collections above.
/// A collection of tests that should always be true regardless of
/// match semantics. That is, all combinations of leftmost-{first, longest}
/// should produce the same answer.
const BASICS: &'static [SearchTest] = &[
t!(basic001, &["a"], "", &[]),
t!(basic010, &["a"], "a", &[(0, 0, 1)]),
t!(basic020, &["a"], "aa", &[(0, 0, 1), (0, 1, 2)]),
t!(basic030, &["a"], "aaa", &[(0, 0, 1), (0, 1, 2), (0, 2, 3)]),
t!(basic040, &["a"], "aba", &[(0, 0, 1), (0, 2, 3)]),
t!(basic050, &["a"], "bba", &[(0, 2, 3)]),
t!(basic060, &["a"], "bbb", &[]),
t!(basic070, &["a"], "bababbbba", &[(0, 1, 2), (0, 3, 4), (0, 8, 9)]),
t!(basic100, &["aa"], "", &[]),
t!(basic110, &["aa"], "aa", &[(0, 0, 2)]),
t!(basic120, &["aa"], "aabbaa", &[(0, 0, 2), (0, 4, 6)]),
t!(basic130, &["aa"], "abbab", &[]),
t!(basic140, &["aa"], "abbabaa", &[(0, 5, 7)]),
t!(basic150, &["aaa"], "aaa", &[(0, 0, 3)]),
t!(basic200, &["abc"], "abc", &[(0, 0, 3)]),
t!(basic210, &["abc"], "zazabzabcz", &[(0, 6, 9)]),
t!(basic220, &["abc"], "zazabczabcz", &[(0, 3, 6), (0, 7, 10)]),
t!(basic230, &["abcd"], "abcd", &[(0, 0, 4)]),
t!(basic240, &["abcd"], "zazabzabcdz", &[(0, 6, 10)]),
t!(basic250, &["abcd"], "zazabcdzabcdz", &[(0, 3, 7), (0, 8, 12)]),
t!(basic300, &["a", "b"], "", &[]),
t!(basic310, &["a", "b"], "z", &[]),
t!(basic320, &["a", "b"], "b", &[(1, 0, 1)]),
t!(basic330, &["a", "b"], "a", &[(0, 0, 1)]),
t!(
basic340,
&["a", "b"],
"abba",
&[(0, 0, 1), (1, 1, 2), (1, 2, 3), (0, 3, 4),]
),
t!(
basic350,
&["b", "a"],
"abba",
&[(1, 0, 1), (0, 1, 2), (0, 2, 3), (1, 3, 4),]
),
t!(basic360, &["abc", "bc"], "xbc", &[(1, 1, 3),]),
t!(basic400, &["foo", "bar"], "", &[]),
t!(basic410, &["foo", "bar"], "foobar", &[(0, 0, 3), (1, 3, 6),]),
t!(basic420, &["foo", "bar"], "barfoo", &[(1, 0, 3), (0, 3, 6),]),
t!(basic430, &["foo", "bar"], "foofoo", &[(0, 0, 3), (0, 3, 6),]),
t!(basic440, &["foo", "bar"], "barbar", &[(1, 0, 3), (1, 3, 6),]),
t!(basic450, &["foo", "bar"], "bafofoo", &[(0, 4, 7),]),
t!(basic460, &["bar", "foo"], "bafofoo", &[(1, 4, 7),]),
t!(basic470, &["foo", "bar"], "fobabar", &[(1, 4, 7),]),
t!(basic480, &["bar", "foo"], "fobabar", &[(0, 4, 7),]),
t!(basic700, &["yabcdef", "abcdezghi"], "yabcdefghi", &[(0, 0, 7),]),
t!(basic710, &["yabcdef", "abcdezghi"], "yabcdezghi", &[(1, 1, 10),]),
t!(
basic720,
&["yabcdef", "bcdeyabc", "abcdezghi"],
"yabcdezghi",
&[(2, 1, 10),]
),
t!(basic810, &["abcd", "bcd", "cd"], "abcd", &[(0, 0, 4),]),
t!(basic820, &["bcd", "cd", "abcd"], "abcd", &[(2, 0, 4),]),
t!(basic830, &["abc", "bc"], "zazabcz", &[(0, 3, 6),]),
t!(
basic840,
&["ab", "ba"],
"abababa",
&[(0, 0, 2), (0, 2, 4), (0, 4, 6),]
),
t!(basic850, &["foo", "foo"], "foobarfoo", &[(0, 0, 3), (0, 6, 9),]),
];
/// Tests for leftmost match semantics. These should pass for both
/// leftmost-first and leftmost-longest match kinds. Stated differently, among
/// ambiguous matches, the longest match and the match that appeared first when
/// constructing the automaton should always be the same.
const LEFTMOST: &'static [SearchTest] = &[
t!(leftmost000, &["ab", "ab"], "abcd", &[(0, 0, 2)]),
t!(leftmost030, &["a", "ab"], "aa", &[(0, 0, 1), (0, 1, 2)]),
t!(leftmost031, &["ab", "a"], "aa", &[(1, 0, 1), (1, 1, 2)]),
t!(leftmost032, &["ab", "a"], "xayabbbz", &[(1, 1, 2), (0, 3, 5)]),
t!(leftmost300, &["abcd", "bce", "b"], "abce", &[(1, 1, 4)]),
t!(leftmost310, &["abcd", "ce", "bc"], "abce", &[(2, 1, 3)]),
t!(leftmost320, &["abcd", "bce", "ce", "b"], "abce", &[(1, 1, 4)]),
t!(leftmost330, &["abcd", "bce", "cz", "bc"], "abcz", &[(3, 1, 3)]),
t!(leftmost340, &["bce", "cz", "bc"], "bcz", &[(2, 0, 2)]),
t!(leftmost350, &["abc", "bd", "ab"], "abd", &[(2, 0, 2)]),
t!(
leftmost360,
&["abcdefghi", "hz", "abcdefgh"],
"abcdefghz",
&[(2, 0, 8),]
),
t!(
leftmost370,
&["abcdefghi", "cde", "hz", "abcdefgh"],
"abcdefghz",
&[(3, 0, 8),]
),
t!(
leftmost380,
&["abcdefghi", "hz", "abcdefgh", "a"],
"abcdefghz",
&[(2, 0, 8),]
),
t!(
leftmost390,
&["b", "abcdefghi", "hz", "abcdefgh"],
"abcdefghz",
&[(3, 0, 8),]
),
t!(
leftmost400,
&["h", "abcdefghi", "hz", "abcdefgh"],
"abcdefghz",
&[(3, 0, 8),]
),
t!(
leftmost410,
&["z", "abcdefghi", "hz", "abcdefgh"],
"abcdefghz",
&[(3, 0, 8), (0, 8, 9),]
),
];
/// Tests for non-overlapping leftmost-first match semantics. These tests
/// should generally be specific to leftmost-first, which means they should
/// generally fail under leftmost-longest semantics.
const LEFTMOST_FIRST: &'static [SearchTest] = &[
t!(leftfirst000, &["ab", "abcd"], "abcd", &[(0, 0, 2)]),
t!(leftfirst020, &["abcd", "ab"], "abcd", &[(0, 0, 4)]),
t!(leftfirst030, &["ab", "ab"], "abcd", &[(0, 0, 2)]),
t!(leftfirst040, &["a", "ab"], "xayabbbz", &[(0, 1, 2), (0, 3, 4)]),
t!(leftfirst100, &["abcdefg", "bcde", "bcdef"], "abcdef", &[(1, 1, 5)]),
t!(leftfirst110, &["abcdefg", "bcdef", "bcde"], "abcdef", &[(1, 1, 6)]),
t!(leftfirst300, &["abcd", "b", "bce"], "abce", &[(1, 1, 2)]),
t!(
leftfirst310,
&["abcd", "b", "bce", "ce"],
"abce",
&[(1, 1, 2), (3, 2, 4),]
),
t!(
leftfirst320,
&["a", "abcdefghi", "hz", "abcdefgh"],
"abcdefghz",
&[(0, 0, 1), (2, 7, 9),]
),
t!(leftfirst330, &["a", "abab"], "abab", &[(0, 0, 1), (0, 2, 3)]),
t!(
leftfirst340,
&["abcdef", "x", "x", "x", "x", "x", "x", "abcde"],
"abcdef",
&[(0, 0, 6)]
),
];
/// Tests for non-overlapping leftmost-longest match semantics. These tests
/// should generally be specific to leftmost-longest, which means they should
/// generally fail under leftmost-first semantics.
const LEFTMOST_LONGEST: &'static [SearchTest] = &[
t!(leftlong000, &["ab", "abcd"], "abcd", &[(1, 0, 4)]),
t!(leftlong010, &["abcd", "bcd", "cd", "b"], "abcd", &[(0, 0, 4),]),
t!(leftlong040, &["a", "ab"], "a", &[(0, 0, 1)]),
t!(leftlong050, &["a", "ab"], "ab", &[(1, 0, 2)]),
t!(leftlong060, &["ab", "a"], "a", &[(1, 0, 1)]),
t!(leftlong070, &["ab", "a"], "ab", &[(0, 0, 2)]),
t!(leftlong100, &["abcdefg", "bcde", "bcdef"], "abcdef", &[(2, 1, 6)]),
t!(leftlong110, &["abcdefg", "bcdef", "bcde"], "abcdef", &[(1, 1, 6)]),
t!(leftlong300, &["abcd", "b", "bce"], "abce", &[(2, 1, 4)]),
t!(
leftlong310,
&["a", "abcdefghi", "hz", "abcdefgh"],
"abcdefghz",
&[(3, 0, 8),]
),
t!(leftlong320, &["a", "abab"], "abab", &[(1, 0, 4)]),
t!(leftlong330, &["abcd", "b", "ce"], "abce", &[(1, 1, 2), (2, 2, 4),]),
t!(leftlong340, &["a", "ab"], "xayabbbz", &[(0, 1, 2), (1, 3, 5)]),
];
/// Regression tests that are applied to all combinations.
///
/// If regression tests are needed for specific match semantics, then add them
/// to the appropriate group above.
const REGRESSION: &'static [SearchTest] = &[
t!(regression010, &["inf", "ind"], "infind", &[(0, 0, 3), (1, 3, 6),]),
t!(regression020, &["ind", "inf"], "infind", &[(1, 0, 3), (0, 3, 6),]),
t!(
regression030,
&["libcore/", "libstd/"],
"libcore/char/methods.rs",
&[(0, 0, 8),]
),
t!(
regression040,
&["libstd/", "libcore/"],
"libcore/char/methods.rs",
&[(1, 0, 8),]
),
t!(
regression050,
&["\x00\x00\x01", "\x00\x00\x00"],
"\x00\x00\x00",
&[(1, 0, 3),]
),
t!(
regression060,
&["\x00\x00\x00", "\x00\x00\x01"],
"\x00\x00\x00",
&[(0, 0, 3),]
),
];
const TEDDY: &'static [SearchTest] = &[
t!(
teddy010,
&["a", "b", "c", "d", "e", "f", "g", "h", "i", "j", "k"],
"abcdefghijk",
&[
(0, 0, 1),
(1, 1, 2),
(2, 2, 3),
(3, 3, 4),
(4, 4, 5),
(5, 5, 6),
(6, 6, 7),
(7, 7, 8),
(8, 8, 9),
(9, 9, 10),
(10, 10, 11)
]
),
t!(
teddy020,
&["ab", "bc", "cd", "de", "ef", "fg", "gh", "hi", "ij", "jk", "kl"],
"abcdefghijk",
&[(0, 0, 2), (2, 2, 4), (4, 4, 6), (6, 6, 8), (8, 8, 10),]
),
t!(
teddy030,
&["abc"],
"abcdefghijklmnopqrstuvwxyzabcdefghijk",
&[(0, 0, 3), (0, 26, 29)]
),
];
// Now define a test for each combination of things above that we want to run.
// Since there are a few different combinations for each collection of tests,
// we define a couple of macros to avoid repetition drudgery. The testconfig
// macro constructs the automaton from a given match kind, and runs the search
// tests one-by-one over the given collection. The `with` parameter allows one
// to configure the config with additional parameters. The testcombo macro
// invokes testconfig in precisely this way: it sets up several tests where
// each one turns a different knob on Config.
macro_rules! testconfig {
($name:ident, $collection:expr, $with:expr) => {
#[test]
fn $name() {
run_search_tests($collection, |test| {
let mut config = Config::new();
$with(&mut config);
let mut builder = config.builder();
builder.extend(test.patterns.iter().map(|p| p.as_bytes()));
let searcher = match builder.build() {
Some(searcher) => searcher,
None => {
// For x86-64 and aarch64, not building a searcher is
// probably a bug, so be loud.
if cfg!(any(
target_arch = "x86_64",
target_arch = "aarch64"
)) {
panic!("failed to build packed searcher")
}
return None;
}
};
Some(searcher.find_iter(&test.haystack).collect())
});
}
};
}
testconfig!(
search_default_leftmost_first,
PACKED_LEFTMOST_FIRST,
|_: &mut Config| {}
);
testconfig!(
search_default_leftmost_longest,
PACKED_LEFTMOST_LONGEST,
|c: &mut Config| {
c.match_kind(MatchKind::LeftmostLongest);
}
);
testconfig!(
search_teddy_leftmost_first,
PACKED_LEFTMOST_FIRST,
|c: &mut Config| {
c.only_teddy(true);
}
);
testconfig!(
search_teddy_leftmost_longest,
PACKED_LEFTMOST_LONGEST,
|c: &mut Config| {
c.only_teddy(true).match_kind(MatchKind::LeftmostLongest);
}
);
testconfig!(
search_teddy_ssse3_leftmost_first,
PACKED_LEFTMOST_FIRST,
|c: &mut Config| {
c.only_teddy(true);
#[cfg(target_arch = "x86_64")]
if std::is_x86_feature_detected!("ssse3") {
c.only_teddy_256bit(Some(false));
}
}
);
testconfig!(
search_teddy_ssse3_leftmost_longest,
PACKED_LEFTMOST_LONGEST,
|c: &mut Config| {
c.only_teddy(true).match_kind(MatchKind::LeftmostLongest);
#[cfg(target_arch = "x86_64")]
if std::is_x86_feature_detected!("ssse3") {
c.only_teddy_256bit(Some(false));
}
}
);
testconfig!(
search_teddy_avx2_leftmost_first,
PACKED_LEFTMOST_FIRST,
|c: &mut Config| {
c.only_teddy(true);
#[cfg(target_arch = "x86_64")]
if std::is_x86_feature_detected!("avx2") {
c.only_teddy_256bit(Some(true));
}
}
);
testconfig!(
search_teddy_avx2_leftmost_longest,
PACKED_LEFTMOST_LONGEST,
|c: &mut Config| {
c.only_teddy(true).match_kind(MatchKind::LeftmostLongest);
#[cfg(target_arch = "x86_64")]
if std::is_x86_feature_detected!("avx2") {
c.only_teddy_256bit(Some(true));
}
}
);
testconfig!(
search_teddy_fat_leftmost_first,
PACKED_LEFTMOST_FIRST,
|c: &mut Config| {
c.only_teddy(true);
#[cfg(target_arch = "x86_64")]
if std::is_x86_feature_detected!("avx2") {
c.only_teddy_fat(Some(true));
}
}
);
testconfig!(
search_teddy_fat_leftmost_longest,
PACKED_LEFTMOST_LONGEST,
|c: &mut Config| {
c.only_teddy(true).match_kind(MatchKind::LeftmostLongest);
#[cfg(target_arch = "x86_64")]
if std::is_x86_feature_detected!("avx2") {
c.only_teddy_fat(Some(true));
}
}
);
testconfig!(
search_rabinkarp_leftmost_first,
PACKED_LEFTMOST_FIRST,
|c: &mut Config| {
c.only_rabin_karp(true);
}
);
testconfig!(
search_rabinkarp_leftmost_longest,
PACKED_LEFTMOST_LONGEST,
|c: &mut Config| {
c.only_rabin_karp(true).match_kind(MatchKind::LeftmostLongest);
}
);
#[test]
fn search_tests_have_unique_names() {
let assert = |constname, tests: &[SearchTest]| {
let mut seen = HashMap::new(); // map from test name to position
for (i, test) in tests.iter().enumerate() {
if !seen.contains_key(test.name) {
seen.insert(test.name, i);
} else {
let last = seen[test.name];
panic!(
"{} tests have duplicate names at positions {} and {}",
constname, last, i
);
}
}
};
assert("BASICS", BASICS);
assert("LEFTMOST", LEFTMOST);
assert("LEFTMOST_FIRST", LEFTMOST_FIRST);
assert("LEFTMOST_LONGEST", LEFTMOST_LONGEST);
assert("REGRESSION", REGRESSION);
assert("TEDDY", TEDDY);
}
fn run_search_tests<F: FnMut(&SearchTestOwned) -> Option<Vec<Match>>>(
which: TestCollection,
mut f: F,
) {
let get_match_triples =
|matches: Vec<Match>| -> Vec<(usize, usize, usize)> {
matches
.into_iter()
.map(|m| (m.pattern().as_usize(), m.start(), m.end()))
.collect()
};
for &tests in which {
for spec in tests {
for test in spec.variations() {
let results = match f(&test) {
None => continue,
Some(results) => results,
};
assert_eq!(
test.matches,
get_match_triples(results).as_slice(),
"test: {}, patterns: {:?}, haystack(len={:?}): {:?}, \
offset: {:?}",
test.name,
test.patterns,
test.haystack.len(),
test.haystack,
test.offset,
);
}
}
}
}

1752
vendor/aho-corasick/src/packed/vector.rs vendored Normal file

File diff suppressed because it is too large Load Diff

1664
vendor/aho-corasick/src/tests.rs vendored Normal file

File diff suppressed because it is too large Load Diff

270
vendor/aho-corasick/src/transducer.rs vendored Normal file
View File

@ -0,0 +1,270 @@
/*!
Provides implementations of `fst::Automaton` for Aho-Corasick automata.
This works by providing two wrapper types, [`Anchored`] and [`Unanchored`].
The former executes an anchored search on an FST while the latter executes
an unanchored search. Building these wrappers is fallible and will fail if
the underlying Aho-Corasick automaton does not support the type of search it
represents.
*/
use crate::{
automaton::{Automaton, StateID},
Anchored as AcAnchored, Input, MatchError,
};
/// Represents an unanchored Aho-Corasick search of a finite state transducer.
///
/// Wrapping an Aho-Corasick automaton in `Unanchored` will fail if the
/// underlying automaton does not support unanchored searches.
///
/// # Example
///
/// This shows how to build an FST of keys and then run an unanchored search on
/// those keys using an Aho-Corasick automaton.
///
/// ```
/// use aho_corasick::{nfa::contiguous::NFA, transducer::Unanchored};
/// use fst::{Automaton, IntoStreamer, Set, Streamer};
///
/// let set = Set::from_iter(&["abcd", "bc", "bcd", "xyz"]).unwrap();
/// let nfa = NFA::new(&["bcd", "x"]).unwrap();
/// // NFAs always support both unanchored and anchored searches.
/// let searcher = Unanchored::new(&nfa).unwrap();
///
/// let mut stream = set.search(searcher).into_stream();
/// let mut results = vec![];
/// while let Some(key) = stream.next() {
/// results.push(std::str::from_utf8(key).unwrap().to_string());
/// }
/// assert_eq!(vec!["abcd", "bcd", "xyz"], results);
/// ```
#[derive(Clone, Debug)]
pub struct Unanchored<A>(A);
impl<A: Automaton> Unanchored<A> {
/// Create a new `Unanchored` implementation of the `fst::Automaton` trait.
///
/// If the given Aho-Corasick automaton does not support unanchored
/// searches, then this returns an error.
pub fn new(aut: A) -> Result<Unanchored<A>, MatchError> {
let input = Input::new("").anchored(AcAnchored::No);
let _ = aut.start_state(&input)?;
Ok(Unanchored(aut))
}
/// Returns a borrow to the underlying automaton.
pub fn as_ref(&self) -> &A {
&self.0
}
/// Unwrap this value and return the inner automaton.
pub fn into_inner(self) -> A {
self.0
}
}
impl<A: Automaton> fst::Automaton for Unanchored<A> {
type State = StateID;
#[inline]
fn start(&self) -> StateID {
let input = Input::new("").anchored(AcAnchored::No);
self.0.start_state(&input).expect("support for unanchored searches")
}
#[inline]
fn is_match(&self, state: &StateID) -> bool {
self.0.is_match(*state)
}
#[inline]
fn accept(&self, state: &StateID, byte: u8) -> StateID {
if fst::Automaton::is_match(self, state) {
return *state;
}
self.0.next_state(AcAnchored::No, *state, byte)
}
#[inline]
fn can_match(&self, state: &StateID) -> bool {
!self.0.is_dead(*state)
}
}
/// Represents an anchored Aho-Corasick search of a finite state transducer.
///
/// Wrapping an Aho-Corasick automaton in `Anchored` will fail if the
/// underlying automaton does not support anchored searches.
///
/// # Example
///
/// This shows how to build an FST of keys and then run an anchored search on
/// those keys using an Aho-Corasick automaton.
///
/// ```
/// use aho_corasick::{nfa::contiguous::NFA, transducer::Anchored};
/// use fst::{Automaton, IntoStreamer, Set, Streamer};
///
/// let set = Set::from_iter(&["abcd", "bc", "bcd", "xyz"]).unwrap();
/// let nfa = NFA::new(&["bcd", "x"]).unwrap();
/// // NFAs always support both unanchored and anchored searches.
/// let searcher = Anchored::new(&nfa).unwrap();
///
/// let mut stream = set.search(searcher).into_stream();
/// let mut results = vec![];
/// while let Some(key) = stream.next() {
/// results.push(std::str::from_utf8(key).unwrap().to_string());
/// }
/// assert_eq!(vec!["bcd", "xyz"], results);
/// ```
///
/// This is like the example above, except we use an Aho-Corasick DFA, which
/// requires explicitly configuring it to support anchored searches. (NFAs
/// unconditionally support both unanchored and anchored searches.)
///
/// ```
/// use aho_corasick::{dfa::DFA, transducer::Anchored, StartKind};
/// use fst::{Automaton, IntoStreamer, Set, Streamer};
///
/// let set = Set::from_iter(&["abcd", "bc", "bcd", "xyz"]).unwrap();
/// let dfa = DFA::builder()
/// .start_kind(StartKind::Anchored)
/// .build(&["bcd", "x"])
/// .unwrap();
/// // We've explicitly configured our DFA to support anchored searches.
/// let searcher = Anchored::new(&dfa).unwrap();
///
/// let mut stream = set.search(searcher).into_stream();
/// let mut results = vec![];
/// while let Some(key) = stream.next() {
/// results.push(std::str::from_utf8(key).unwrap().to_string());
/// }
/// assert_eq!(vec!["bcd", "xyz"], results);
/// ```
#[derive(Clone, Debug)]
pub struct Anchored<A>(A);
impl<A: Automaton> Anchored<A> {
/// Create a new `Anchored` implementation of the `fst::Automaton` trait.
///
/// If the given Aho-Corasick automaton does not support anchored searches,
/// then this returns an error.
pub fn new(aut: A) -> Result<Anchored<A>, MatchError> {
let input = Input::new("").anchored(AcAnchored::Yes);
let _ = aut.start_state(&input)?;
Ok(Anchored(aut))
}
/// Returns a borrow to the underlying automaton.
pub fn as_ref(&self) -> &A {
&self.0
}
/// Unwrap this value and return the inner automaton.
pub fn into_inner(self) -> A {
self.0
}
}
impl<A: Automaton> fst::Automaton for Anchored<A> {
type State = StateID;
#[inline]
fn start(&self) -> StateID {
let input = Input::new("").anchored(AcAnchored::Yes);
self.0.start_state(&input).expect("support for anchored searches")
}
#[inline]
fn is_match(&self, state: &StateID) -> bool {
self.0.is_match(*state)
}
#[inline]
fn accept(&self, state: &StateID, byte: u8) -> StateID {
if fst::Automaton::is_match(self, state) {
return *state;
}
self.0.next_state(AcAnchored::Yes, *state, byte)
}
#[inline]
fn can_match(&self, state: &StateID) -> bool {
!self.0.is_dead(*state)
}
}
#[cfg(test)]
mod tests {
use alloc::{string::String, vec, vec::Vec};
use fst::{Automaton, IntoStreamer, Set, Streamer};
use crate::{
dfa::DFA,
nfa::{contiguous, noncontiguous},
StartKind,
};
use super::*;
fn search<A: Automaton, D: AsRef<[u8]>>(
set: &Set<D>,
aut: A,
) -> Vec<String> {
let mut stream = set.search(aut).into_stream();
let mut results = vec![];
while let Some(key) = stream.next() {
results.push(String::from(core::str::from_utf8(key).unwrap()));
}
results
}
#[test]
fn unanchored() {
let set =
Set::from_iter(&["a", "bar", "baz", "wat", "xba", "xbax", "z"])
.unwrap();
let patterns = vec!["baz", "bax"];
let expected = vec!["baz", "xbax"];
let aut = Unanchored(noncontiguous::NFA::new(&patterns).unwrap());
let got = search(&set, &aut);
assert_eq!(got, expected);
let aut = Unanchored(contiguous::NFA::new(&patterns).unwrap());
let got = search(&set, &aut);
assert_eq!(got, expected);
let aut = Unanchored(DFA::new(&patterns).unwrap());
let got = search(&set, &aut);
assert_eq!(got, expected);
}
#[test]
fn anchored() {
let set =
Set::from_iter(&["a", "bar", "baz", "wat", "xba", "xbax", "z"])
.unwrap();
let patterns = vec!["baz", "bax"];
let expected = vec!["baz"];
let aut = Anchored(noncontiguous::NFA::new(&patterns).unwrap());
let got = search(&set, &aut);
assert_eq!(got, expected);
let aut = Anchored(contiguous::NFA::new(&patterns).unwrap());
let got = search(&set, &aut);
assert_eq!(got, expected);
let aut = Anchored(
DFA::builder()
.start_kind(StartKind::Anchored)
.build(&patterns)
.unwrap(),
);
let got = search(&set, &aut);
assert_eq!(got, expected);
}
}

409
vendor/aho-corasick/src/util/alphabet.rs vendored Normal file
View File

@ -0,0 +1,409 @@
use crate::util::int::Usize;
/// A representation of byte oriented equivalence classes.
///
/// This is used in finite state machines to reduce the size of the transition
/// table. This can have a particularly large impact not only on the total size
/// of an FSM, but also on FSM build times because it reduces the number of
/// transitions that need to be visited/set.
#[derive(Clone, Copy)]
pub(crate) struct ByteClasses([u8; 256]);
impl ByteClasses {
/// Creates a new set of equivalence classes where all bytes are mapped to
/// the same class.
pub(crate) fn empty() -> ByteClasses {
ByteClasses([0; 256])
}
/// Creates a new set of equivalence classes where each byte belongs to
/// its own equivalence class.
pub(crate) fn singletons() -> ByteClasses {
let mut classes = ByteClasses::empty();
for b in 0..=255 {
classes.set(b, b);
}
classes
}
/// Set the equivalence class for the given byte.
#[inline]
pub(crate) fn set(&mut self, byte: u8, class: u8) {
self.0[usize::from(byte)] = class;
}
/// Get the equivalence class for the given byte.
#[inline]
pub(crate) fn get(&self, byte: u8) -> u8 {
self.0[usize::from(byte)]
}
/// Return the total number of elements in the alphabet represented by
/// these equivalence classes. Equivalently, this returns the total number
/// of equivalence classes.
#[inline]
pub(crate) fn alphabet_len(&self) -> usize {
// Add one since the number of equivalence classes is one bigger than
// the last one.
usize::from(self.0[255]) + 1
}
/// Returns the stride, as a base-2 exponent, required for these
/// equivalence classes.
///
/// The stride is always the smallest power of 2 that is greater than or
/// equal to the alphabet length. This is done so that converting between
/// state IDs and indices can be done with shifts alone, which is much
/// faster than integer division. The "stride2" is the exponent. i.e.,
/// `2^stride2 = stride`.
pub(crate) fn stride2(&self) -> usize {
let zeros = self.alphabet_len().next_power_of_two().trailing_zeros();
usize::try_from(zeros).unwrap()
}
/// Returns the stride for these equivalence classes, which corresponds
/// to the smallest power of 2 greater than or equal to the number of
/// equivalence classes.
pub(crate) fn stride(&self) -> usize {
1 << self.stride2()
}
/// Returns true if and only if every byte in this class maps to its own
/// equivalence class. Equivalently, there are 256 equivalence classes
/// and each class contains exactly one byte.
#[inline]
pub(crate) fn is_singleton(&self) -> bool {
self.alphabet_len() == 256
}
/// Returns an iterator over all equivalence classes in this set.
pub(crate) fn iter(&self) -> ByteClassIter {
ByteClassIter { it: 0..self.alphabet_len() }
}
/// Returns an iterator of the bytes in the given equivalence class.
pub(crate) fn elements(&self, class: u8) -> ByteClassElements {
ByteClassElements { classes: self, class, bytes: 0..=255 }
}
/// Returns an iterator of byte ranges in the given equivalence class.
///
/// That is, a sequence of contiguous ranges are returned. Typically, every
/// class maps to a single contiguous range.
fn element_ranges(&self, class: u8) -> ByteClassElementRanges {
ByteClassElementRanges { elements: self.elements(class), range: None }
}
}
impl core::fmt::Debug for ByteClasses {
fn fmt(&self, f: &mut core::fmt::Formatter<'_>) -> core::fmt::Result {
if self.is_singleton() {
write!(f, "ByteClasses(<one-class-per-byte>)")
} else {
write!(f, "ByteClasses(")?;
for (i, class) in self.iter().enumerate() {
if i > 0 {
write!(f, ", ")?;
}
write!(f, "{:?} => [", class)?;
for (start, end) in self.element_ranges(class) {
if start == end {
write!(f, "{:?}", start)?;
} else {
write!(f, "{:?}-{:?}", start, end)?;
}
}
write!(f, "]")?;
}
write!(f, ")")
}
}
}
/// An iterator over each equivalence class.
#[derive(Debug)]
pub(crate) struct ByteClassIter {
it: core::ops::Range<usize>,
}
impl Iterator for ByteClassIter {
type Item = u8;
fn next(&mut self) -> Option<u8> {
self.it.next().map(|class| class.as_u8())
}
}
/// An iterator over all elements in a specific equivalence class.
#[derive(Debug)]
pub(crate) struct ByteClassElements<'a> {
classes: &'a ByteClasses,
class: u8,
bytes: core::ops::RangeInclusive<u8>,
}
impl<'a> Iterator for ByteClassElements<'a> {
type Item = u8;
fn next(&mut self) -> Option<u8> {
while let Some(byte) = self.bytes.next() {
if self.class == self.classes.get(byte) {
return Some(byte);
}
}
None
}
}
/// An iterator over all elements in an equivalence class expressed as a
/// sequence of contiguous ranges.
#[derive(Debug)]
pub(crate) struct ByteClassElementRanges<'a> {
elements: ByteClassElements<'a>,
range: Option<(u8, u8)>,
}
impl<'a> Iterator for ByteClassElementRanges<'a> {
type Item = (u8, u8);
fn next(&mut self) -> Option<(u8, u8)> {
loop {
let element = match self.elements.next() {
None => return self.range.take(),
Some(element) => element,
};
match self.range.take() {
None => {
self.range = Some((element, element));
}
Some((start, end)) => {
if usize::from(end) + 1 != usize::from(element) {
self.range = Some((element, element));
return Some((start, end));
}
self.range = Some((start, element));
}
}
}
}
}
/// A partitioning of bytes into equivalence classes.
///
/// A byte class set keeps track of an *approximation* of equivalence classes
/// of bytes during NFA construction. That is, every byte in an equivalence
/// class cannot discriminate between a match and a non-match.
///
/// Note that this may not compute the minimal set of equivalence classes.
/// Basically, any byte in a pattern given to the noncontiguous NFA builder
/// will automatically be treated as its own equivalence class. All other
/// bytes---any byte not in any pattern---will be treated as their own
/// equivalence classes. In theory, all bytes not in any pattern should
/// be part of a single equivalence class, but in practice, we only treat
/// contiguous ranges of bytes as an equivalence class. So the number of
/// classes computed may be bigger than necessary. This usually doesn't make
/// much of a difference, and keeps the implementation simple.
#[derive(Clone, Debug)]
pub(crate) struct ByteClassSet(ByteSet);
impl Default for ByteClassSet {
fn default() -> ByteClassSet {
ByteClassSet::empty()
}
}
impl ByteClassSet {
/// Create a new set of byte classes where all bytes are part of the same
/// equivalence class.
pub(crate) fn empty() -> Self {
ByteClassSet(ByteSet::empty())
}
/// Indicate that the given range of bytes (inclusive) can discriminate a
/// match between it and all other bytes outside of the range.
pub(crate) fn set_range(&mut self, start: u8, end: u8) {
debug_assert!(start <= end);
if start > 0 {
self.0.add(start - 1);
}
self.0.add(end);
}
/// Convert this boolean set to a map that maps all byte values to their
/// corresponding equivalence class. The last mapping indicates the largest
/// equivalence class identifier (which is never bigger than 255).
pub(crate) fn byte_classes(&self) -> ByteClasses {
let mut classes = ByteClasses::empty();
let mut class = 0u8;
let mut b = 0u8;
loop {
classes.set(b, class);
if b == 255 {
break;
}
if self.0.contains(b) {
class = class.checked_add(1).unwrap();
}
b = b.checked_add(1).unwrap();
}
classes
}
}
/// A simple set of bytes that is reasonably cheap to copy and allocation free.
#[derive(Clone, Copy, Debug, Default, Eq, PartialEq)]
pub(crate) struct ByteSet {
bits: BitSet,
}
/// The representation of a byte set. Split out so that we can define a
/// convenient Debug impl for it while keeping "ByteSet" in the output.
#[derive(Clone, Copy, Default, Eq, PartialEq)]
struct BitSet([u128; 2]);
impl ByteSet {
/// Create an empty set of bytes.
pub(crate) fn empty() -> ByteSet {
ByteSet { bits: BitSet([0; 2]) }
}
/// Add a byte to this set.
///
/// If the given byte already belongs to this set, then this is a no-op.
pub(crate) fn add(&mut self, byte: u8) {
let bucket = byte / 128;
let bit = byte % 128;
self.bits.0[usize::from(bucket)] |= 1 << bit;
}
/// Return true if and only if the given byte is in this set.
pub(crate) fn contains(&self, byte: u8) -> bool {
let bucket = byte / 128;
let bit = byte % 128;
self.bits.0[usize::from(bucket)] & (1 << bit) > 0
}
}
impl core::fmt::Debug for BitSet {
fn fmt(&self, f: &mut core::fmt::Formatter) -> core::fmt::Result {
let mut fmtd = f.debug_set();
for b in 0u8..=255 {
if (ByteSet { bits: *self }).contains(b) {
fmtd.entry(&b);
}
}
fmtd.finish()
}
}
#[cfg(test)]
mod tests {
use alloc::{vec, vec::Vec};
use super::*;
#[test]
fn byte_classes() {
let mut set = ByteClassSet::empty();
set.set_range(b'a', b'z');
let classes = set.byte_classes();
assert_eq!(classes.get(0), 0);
assert_eq!(classes.get(1), 0);
assert_eq!(classes.get(2), 0);
assert_eq!(classes.get(b'a' - 1), 0);
assert_eq!(classes.get(b'a'), 1);
assert_eq!(classes.get(b'm'), 1);
assert_eq!(classes.get(b'z'), 1);
assert_eq!(classes.get(b'z' + 1), 2);
assert_eq!(classes.get(254), 2);
assert_eq!(classes.get(255), 2);
let mut set = ByteClassSet::empty();
set.set_range(0, 2);
set.set_range(4, 6);
let classes = set.byte_classes();
assert_eq!(classes.get(0), 0);
assert_eq!(classes.get(1), 0);
assert_eq!(classes.get(2), 0);
assert_eq!(classes.get(3), 1);
assert_eq!(classes.get(4), 2);
assert_eq!(classes.get(5), 2);
assert_eq!(classes.get(6), 2);
assert_eq!(classes.get(7), 3);
assert_eq!(classes.get(255), 3);
}
#[test]
fn full_byte_classes() {
let mut set = ByteClassSet::empty();
for b in 0u8..=255 {
set.set_range(b, b);
}
assert_eq!(set.byte_classes().alphabet_len(), 256);
}
#[test]
fn elements_typical() {
let mut set = ByteClassSet::empty();
set.set_range(b'b', b'd');
set.set_range(b'g', b'm');
set.set_range(b'z', b'z');
let classes = set.byte_classes();
// class 0: \x00-a
// class 1: b-d
// class 2: e-f
// class 3: g-m
// class 4: n-y
// class 5: z-z
// class 6: \x7B-\xFF
assert_eq!(classes.alphabet_len(), 7);
let elements = classes.elements(0).collect::<Vec<_>>();
assert_eq!(elements.len(), 98);
assert_eq!(elements[0], b'\x00');
assert_eq!(elements[97], b'a');
let elements = classes.elements(1).collect::<Vec<_>>();
assert_eq!(elements, vec![b'b', b'c', b'd'],);
let elements = classes.elements(2).collect::<Vec<_>>();
assert_eq!(elements, vec![b'e', b'f'],);
let elements = classes.elements(3).collect::<Vec<_>>();
assert_eq!(elements, vec![b'g', b'h', b'i', b'j', b'k', b'l', b'm',],);
let elements = classes.elements(4).collect::<Vec<_>>();
assert_eq!(elements.len(), 12);
assert_eq!(elements[0], b'n');
assert_eq!(elements[11], b'y');
let elements = classes.elements(5).collect::<Vec<_>>();
assert_eq!(elements, vec![b'z']);
let elements = classes.elements(6).collect::<Vec<_>>();
assert_eq!(elements.len(), 133);
assert_eq!(elements[0], b'\x7B');
assert_eq!(elements[132], b'\xFF');
}
#[test]
fn elements_singletons() {
let classes = ByteClasses::singletons();
assert_eq!(classes.alphabet_len(), 256);
let elements = classes.elements(b'a').collect::<Vec<_>>();
assert_eq!(elements, vec![b'a']);
}
#[test]
fn elements_empty() {
let classes = ByteClasses::empty();
assert_eq!(classes.alphabet_len(), 1);
let elements = classes.elements(0).collect::<Vec<_>>();
assert_eq!(elements.len(), 256);
assert_eq!(elements[0], b'\x00');
assert_eq!(elements[255], b'\xFF');
}
}
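To make the mapping concrete, here is a small hypothetical sketch, written as if it were one more unit test in the module above (these types are pub(crate), so it only compiles inside the crate). A single set_range call splits the byte space into three classes:

#[test]
fn byte_classes_digits_sketch() {
    // Marking the digit range records boundaries at b'0' - 1 and b'9', so
    // the 256 byte values collapse into three equivalence classes.
    let mut set = ByteClassSet::empty();
    set.set_range(b'0', b'9');
    let classes = set.byte_classes();
    assert_eq!(classes.alphabet_len(), 3);
    assert_eq!(classes.get(b'/'), 0); // everything below '0'
    assert_eq!(classes.get(b'0'), 1);
    assert_eq!(classes.get(b'9'), 1);
    assert_eq!(classes.get(b':'), 2); // everything above '9'
}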

124
vendor/aho-corasick/src/util/buffer.rs vendored Normal file
View File

@ -0,0 +1,124 @@
use alloc::{vec, vec::Vec};
/// The default buffer capacity that we use for the stream buffer.
const DEFAULT_BUFFER_CAPACITY: usize = 64 * (1 << 10); // 64 KB
/// A fairly simple roll buffer for supporting stream searches.
///
/// This buffer acts as a temporary place to store a fixed amount of data when
/// reading from a stream. Its central purpose is to allow "rolling" some
/// suffix of the data to the beginning of the buffer before refilling it with
/// more data from the stream. For example, let's say we are trying to match
/// "foobar" on a stream. When we report the match, we'd like to not only
/// report the correct offsets at which the match occurs, but also the matching
/// bytes themselves. So let's say our stream is a file with the following
/// contents: `test test foobar test test`. Now assume that we happen to read
/// the aforementioned file in two chunks: `test test foo` and `bar test test`.
/// Naively, it would not be possible to report a single contiguous `foobar`
/// match, but this roll buffer allows us to do that. Namely, after the second
/// read, the contents of the buffer should be `st foobar test test`, where the
/// search should ultimately resume immediately after `foo`. (The prefix `st `
/// is included because the roll buffer saves N bytes at the end of the buffer,
/// where N is the maximum possible length of a match.)
///
/// A lot of the logic for dealing with this is unfortunately split out between
/// this roll buffer and the `StreamChunkIter`.
///
/// Note also that this buffer is not actually required to just report matches.
/// Because a `Match` is just some offsets. But it *is* required for supporting
/// things like `try_stream_replace_all` because that needs some mechanism for
/// knowing which bytes in the stream correspond to a match and which don't. So
/// when a match occurs across two `read` calls, *something* needs to retain
/// the bytes from the previous `read` call because you don't know before the
/// second read call whether a match exists or not.
#[derive(Debug)]
pub(crate) struct Buffer {
/// The raw buffer contents. This has a fixed size and never increases.
buf: Vec<u8>,
/// The minimum size of the buffer, which is equivalent to the maximum
/// possible length of a match. This corresponds to the amount that we
/// roll to the front of the buffer before refilling it.
min: usize,
/// The end of the contents of this buffer.
end: usize,
}
impl Buffer {
/// Create a new buffer for stream searching. The minimum buffer length
/// given should be the size of the maximum possible match length.
pub(crate) fn new(min_buffer_len: usize) -> Buffer {
let min = core::cmp::max(1, min_buffer_len);
// The minimum buffer amount is also the amount that we roll our
// buffer in order to support incremental searching. To this end,
// our actual capacity needs to be at least 1 byte bigger than our
// minimum amount, otherwise we won't have any overlap. In actuality,
// we want our buffer to be a bit bigger than that for performance
// reasons, so we set a lower bound of `8 * min`.
//
// TODO: It would be good to find a way to test the streaming
// implementation with the minimal buffer size. For now, we just
// uncomment the next line and comment out the subsequent line.
// let capacity = 1 + min;
let capacity = core::cmp::max(min * 8, DEFAULT_BUFFER_CAPACITY);
Buffer { buf: vec![0; capacity], min, end: 0 }
}
/// Return the contents of this buffer.
#[inline]
pub(crate) fn buffer(&self) -> &[u8] {
&self.buf[..self.end]
}
/// Return the minimum size of the buffer. The only way a buffer may be
/// smaller than this is if the stream itself contains less than the
/// minimum buffer amount.
#[inline]
pub(crate) fn min_buffer_len(&self) -> usize {
self.min
}
/// Return all free capacity in this buffer.
fn free_buffer(&mut self) -> &mut [u8] {
&mut self.buf[self.end..]
}
/// Refill the contents of this buffer by reading as much as possible into
/// this buffer's free capacity. If no more bytes could be read, then this
/// returns false. Otherwise, this reads until it has filled the buffer
/// past the minimum amount.
pub(crate) fn fill<R: std::io::Read>(
&mut self,
mut rdr: R,
) -> std::io::Result<bool> {
let mut readany = false;
loop {
let readlen = rdr.read(self.free_buffer())?;
if readlen == 0 {
return Ok(readany);
}
readany = true;
self.end += readlen;
if self.buffer().len() >= self.min {
return Ok(true);
}
}
}
/// Roll the contents of the buffer so that the suffix of this buffer is
/// moved to the front and all other contents are dropped. The size of the
/// suffix corresponds precisely to the minimum buffer length.
///
/// This should only be called when the entire contents of this buffer have
/// been searched.
pub(crate) fn roll(&mut self) {
let roll_start = self
.end
.checked_sub(self.min)
.expect("buffer capacity should be bigger than minimum amount");
let roll_end = roll_start + self.min;
assert!(roll_end <= self.end);
self.buf.copy_within(roll_start..roll_end, 0);
self.end = self.min;
}
}
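As a rough illustration of how fill and roll are meant to be driven, here is a hypothetical helper (the name count_occurrences and its signature are illustrative, and it assumes crate-internal access to Buffer). It counts possibly overlapping occurrences of a non-empty needle across multiple read calls:

fn count_occurrences<R: std::io::Read>(
    needle: &[u8],
    mut rdr: R,
) -> std::io::Result<usize> {
    assert!(!needle.is_empty());
    let mut buf = Buffer::new(needle.len());
    let mut count = 0;
    let mut first = true;
    loop {
        if !buf.fill(&mut rdr)? {
            break;
        }
        let haystack = buf.buffer();
        // After a roll, the window at offset 0 lies entirely within the
        // rolled suffix and was already counted in the previous iteration,
        // so skip it on every iteration but the first.
        let skip = if first { 0 } else { 1 };
        count += haystack
            .windows(needle.len())
            .skip(skip)
            .filter(|w| *w == needle)
            .count();
        first = false;
        if haystack.len() < buf.min_buffer_len() {
            // The stream ended before reaching the minimum buffer length,
            // so there is nothing to roll and nothing left to search.
            break;
        }
        buf.roll();
    }
    Ok(count)
}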

258
vendor/aho-corasick/src/util/byte_frequencies.rs vendored Normal file
View File

@ -0,0 +1,258 @@
pub const BYTE_FREQUENCIES: [u8; 256] = [
55, // '\x00'
52, // '\x01'
51, // '\x02'
50, // '\x03'
49, // '\x04'
48, // '\x05'
47, // '\x06'
46, // '\x07'
45, // '\x08'
103, // '\t'
242, // '\n'
66, // '\x0b'
67, // '\x0c'
229, // '\r'
44, // '\x0e'
43, // '\x0f'
42, // '\x10'
41, // '\x11'
40, // '\x12'
39, // '\x13'
38, // '\x14'
37, // '\x15'
36, // '\x16'
35, // '\x17'
34, // '\x18'
33, // '\x19'
56, // '\x1a'
32, // '\x1b'
31, // '\x1c'
30, // '\x1d'
29, // '\x1e'
28, // '\x1f'
255, // ' '
148, // '!'
164, // '"'
149, // '#'
136, // '$'
160, // '%'
155, // '&'
173, // "'"
221, // '('
222, // ')'
134, // '*'
122, // '+'
232, // ','
202, // '-'
215, // '.'
224, // '/'
208, // '0'
220, // '1'
204, // '2'
187, // '3'
183, // '4'
179, // '5'
177, // '6'
168, // '7'
178, // '8'
200, // '9'
226, // ':'
195, // ';'
154, // '<'
184, // '='
174, // '>'
126, // '?'
120, // '@'
191, // 'A'
157, // 'B'
194, // 'C'
170, // 'D'
189, // 'E'
162, // 'F'
161, // 'G'
150, // 'H'
193, // 'I'
142, // 'J'
137, // 'K'
171, // 'L'
176, // 'M'
185, // 'N'
167, // 'O'
186, // 'P'
112, // 'Q'
175, // 'R'
192, // 'S'
188, // 'T'
156, // 'U'
140, // 'V'
143, // 'W'
123, // 'X'
133, // 'Y'
128, // 'Z'
147, // '['
138, // '\\'
146, // ']'
114, // '^'
223, // '_'
151, // '`'
249, // 'a'
216, // 'b'
238, // 'c'
236, // 'd'
253, // 'e'
227, // 'f'
218, // 'g'
230, // 'h'
247, // 'i'
135, // 'j'
180, // 'k'
241, // 'l'
233, // 'm'
246, // 'n'
244, // 'o'
231, // 'p'
139, // 'q'
245, // 'r'
243, // 's'
251, // 't'
235, // 'u'
201, // 'v'
196, // 'w'
240, // 'x'
214, // 'y'
152, // 'z'
182, // '{'
205, // '|'
181, // '}'
127, // '~'
27, // '\x7f'
212, // '\x80'
211, // '\x81'
210, // '\x82'
213, // '\x83'
228, // '\x84'
197, // '\x85'
169, // '\x86'
159, // '\x87'
131, // '\x88'
172, // '\x89'
105, // '\x8a'
80, // '\x8b'
98, // '\x8c'
96, // '\x8d'
97, // '\x8e'
81, // '\x8f'
207, // '\x90'
145, // '\x91'
116, // '\x92'
115, // '\x93'
144, // '\x94'
130, // '\x95'
153, // '\x96'
121, // '\x97'
107, // '\x98'
132, // '\x99'
109, // '\x9a'
110, // '\x9b'
124, // '\x9c'
111, // '\x9d'
82, // '\x9e'
108, // '\x9f'
118, // '\xa0'
141, // '¡'
113, // '¢'
129, // '£'
119, // '¤'
125, // '¥'
165, // '¦'
117, // '§'
92, // '¨'
106, // '©'
83, // 'ª'
72, // '«'
99, // '¬'
93, // '\xad'
65, // '®'
79, // '¯'
166, // '°'
237, // '±'
163, // '²'
199, // '³'
190, // '´'
225, // 'µ'
209, // '¶'
203, // '·'
198, // '¸'
217, // '¹'
219, // 'º'
206, // '»'
234, // '¼'
248, // '½'
158, // '¾'
239, // '¿'
255, // 'À'
255, // 'Á'
255, // 'Â'
255, // 'Ã'
255, // 'Ä'
255, // 'Å'
255, // 'Æ'
255, // 'Ç'
255, // 'È'
255, // 'É'
255, // 'Ê'
255, // 'Ë'
255, // 'Ì'
255, // 'Í'
255, // 'Î'
255, // 'Ï'
255, // 'Ð'
255, // 'Ñ'
255, // 'Ò'
255, // 'Ó'
255, // 'Ô'
255, // 'Õ'
255, // 'Ö'
255, // '×'
255, // 'Ø'
255, // 'Ù'
255, // 'Ú'
255, // 'Û'
255, // 'Ü'
255, // 'Ý'
255, // 'Þ'
255, // 'ß'
255, // 'à'
255, // 'á'
255, // 'â'
255, // 'ã'
255, // 'ä'
255, // 'å'
255, // 'æ'
255, // 'ç'
255, // 'è'
255, // 'é'
255, // 'ê'
255, // 'ë'
255, // 'ì'
255, // 'í'
255, // 'î'
255, // 'ï'
255, // 'ð'
255, // 'ñ'
255, // 'ò'
255, // 'ó'
255, // 'ô'
255, // 'õ'
255, // 'ö'
255, // '÷'
255, // 'ø'
255, // 'ù'
255, // 'ú'
255, // 'û'
255, // 'ü'
255, // 'ý'
255, // 'þ'
255, // 'ÿ'
];
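The table above is indexed directly by byte value, and larger entries mean (heuristically) more common bytes. A couple of hypothetical spot checks:

#[test]
fn byte_frequencies_spot_checks() {
    assert_eq!(BYTE_FREQUENCIES[usize::from(b' ')], 255); // space: most common
    assert_eq!(BYTE_FREQUENCIES[usize::from(b'e')], 253); // very common letter
    assert_eq!(BYTE_FREQUENCIES[usize::from(b'\x7f')], 27); // DEL: rare
}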

26
vendor/aho-corasick/src/util/debug.rs vendored Normal file
View File

@ -0,0 +1,26 @@
/// A type that wraps a single byte with a convenient fmt::Debug impl that
/// escapes the byte.
pub(crate) struct DebugByte(pub(crate) u8);
impl core::fmt::Debug for DebugByte {
fn fmt(&self, f: &mut core::fmt::Formatter) -> core::fmt::Result {
// Special case ASCII space. It's too hard to read otherwise, so
// put quotes around it. I sometimes wonder whether just '\x20' would
// be better...
if self.0 == b' ' {
return write!(f, "' '");
}
// 10 bytes is enough to cover any output from ascii::escape_default.
let mut bytes = [0u8; 10];
let mut len = 0;
for (i, mut b) in core::ascii::escape_default(self.0).enumerate() {
// capitalize \xab to \xAB
if i >= 2 && b'a' <= b && b <= b'f' {
b -= 32;
}
bytes[len] = b;
len += 1;
}
write!(f, "{}", core::str::from_utf8(&bytes[..len]).unwrap())
}
}
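A few hypothetical spot checks of this impl (DebugByte is pub(crate), so a test like this would have to live inside the crate):

#[test]
fn debug_byte_sketch() {
    use alloc::format;
    // Printable bytes pass through, control bytes are escaped, hex escapes
    // are upper-cased, and the ASCII space gets quoted.
    assert_eq!(format!("{:?}", DebugByte(b'a')), "a");
    assert_eq!(format!("{:?}", DebugByte(b'\n')), r"\n");
    assert_eq!(format!("{:?}", DebugByte(0xAB)), r"\xAB");
    assert_eq!(format!("{:?}", DebugByte(b' ')), "' '");
}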

259
vendor/aho-corasick/src/util/error.rs vendored Normal file
View File

@ -0,0 +1,259 @@
use crate::util::{
primitives::{PatternID, SmallIndex},
search::MatchKind,
};
/// An error that occurred during the construction of an Aho-Corasick
/// automaton.
///
/// Build errors occur when some kind of limit has been exceeded, either in the
/// number of states, the number of patterns, or the length of a pattern. These
/// limits aren't part of the public API, but they should generally be large
/// enough to handle most use cases.
///
/// When the `std` feature is enabled, this implements the `std::error::Error`
/// trait.
#[derive(Clone, Debug)]
pub struct BuildError {
kind: ErrorKind,
}
/// The kind of error that occurred.
#[derive(Clone, Debug)]
enum ErrorKind {
/// An error that occurs when allocating a new state would result in an
/// identifier that exceeds the capacity of a `StateID`.
StateIDOverflow {
/// The maximum possible id.
max: u64,
/// The maximum ID requested.
requested_max: u64,
},
/// An error that occurs when adding a pattern to an Aho-Corasick
/// automaton would result in an identifier that exceeds the capacity of a
/// `PatternID`.
PatternIDOverflow {
/// The maximum possible id.
max: u64,
/// The maximum ID requested.
requested_max: u64,
},
/// Occurs when a pattern string is given to the Aho-Corasick constructor
/// that is too long.
PatternTooLong {
/// The ID of the pattern that was too long.
pattern: PatternID,
/// The length that was too long.
len: usize,
},
}
impl BuildError {
pub(crate) fn state_id_overflow(
max: u64,
requested_max: u64,
) -> BuildError {
BuildError { kind: ErrorKind::StateIDOverflow { max, requested_max } }
}
pub(crate) fn pattern_id_overflow(
max: u64,
requested_max: u64,
) -> BuildError {
BuildError {
kind: ErrorKind::PatternIDOverflow { max, requested_max },
}
}
pub(crate) fn pattern_too_long(
pattern: PatternID,
len: usize,
) -> BuildError {
BuildError { kind: ErrorKind::PatternTooLong { pattern, len } }
}
}
#[cfg(feature = "std")]
impl std::error::Error for BuildError {}
impl core::fmt::Display for BuildError {
fn fmt(&self, f: &mut core::fmt::Formatter<'_>) -> core::fmt::Result {
match self.kind {
ErrorKind::StateIDOverflow { max, requested_max } => {
write!(
f,
"state identifier overflow: failed to create state ID \
from {}, which exceeds the max of {}",
requested_max, max,
)
}
ErrorKind::PatternIDOverflow { max, requested_max } => {
write!(
f,
"pattern identifier overflow: failed to create pattern ID \
from {}, which exceeds the max of {}",
requested_max, max,
)
}
ErrorKind::PatternTooLong { pattern, len } => {
write!(
f,
"pattern {} with length {} exceeds \
the maximum pattern length of {}",
pattern.as_usize(),
len,
SmallIndex::MAX.as_usize(),
)
}
}
}
}
/// An error that occurred during an Aho-Corasick search.
///
/// An error that occurs during a search is limited to some kind of
/// misconfiguration that resulted in an illegal call. Stated differently,
/// whether an error occurs is not dependent on the specific bytes in the
/// haystack.
///
/// Examples of misconfiguration:
///
/// * Executing a stream or overlapping search on a searcher that was built with
/// something other than [`MatchKind::Standard`](crate::MatchKind::Standard)
/// semantics.
/// * Requested an anchored or an unanchored search on a searcher that doesn't
/// support unanchored or anchored searches, respectively.
///
/// When the `std` feature is enabled, this implements the `std::error::Error`
/// trait.
#[derive(Clone, Debug, Eq, PartialEq)]
pub struct MatchError(alloc::boxed::Box<MatchErrorKind>);
impl MatchError {
/// Create a new error value with the given kind.
///
/// This is a more verbose version of the kind-specific constructors, e.g.,
/// `MatchError::unsupported_stream`.
pub fn new(kind: MatchErrorKind) -> MatchError {
MatchError(alloc::boxed::Box::new(kind))
}
/// Returns a reference to the underlying error kind.
pub fn kind(&self) -> &MatchErrorKind {
&self.0
}
/// Create a new "invalid anchored search" error. This occurs when the
/// caller requests an anchored search but where anchored searches aren't
/// supported.
///
/// This is the same as calling `MatchError::new` with a
/// [`MatchErrorKind::InvalidInputAnchored`] kind.
pub fn invalid_input_anchored() -> MatchError {
MatchError::new(MatchErrorKind::InvalidInputAnchored)
}
/// Create a new "invalid unanchored search" error. This occurs when the
/// caller requests an unanchored search but where unanchored searches
/// aren't supported.
///
/// This is the same as calling `MatchError::new` with a
/// [`MatchErrorKind::InvalidInputUnanchored`] kind.
pub fn invalid_input_unanchored() -> MatchError {
MatchError::new(MatchErrorKind::InvalidInputUnanchored)
}
/// Create a new "unsupported stream search" error. This occurs when the
/// caller requests a stream search while using an Aho-Corasick automaton
/// with a match kind other than [`MatchKind::Standard`].
///
/// The match kind given should be the match kind of the automaton. It
/// should never be `MatchKind::Standard`.
pub fn unsupported_stream(got: MatchKind) -> MatchError {
MatchError::new(MatchErrorKind::UnsupportedStream { got })
}
/// Create a new "unsupported overlapping search" error. This occurs when
/// the caller requests an overlapping search while using an Aho-Corasick
/// automaton with a match kind other than [`MatchKind::Standard`].
///
/// The match kind given should be the match kind of the automaton. It
/// should never be `MatchKind::Standard`.
pub fn unsupported_overlapping(got: MatchKind) -> MatchError {
MatchError::new(MatchErrorKind::UnsupportedOverlapping { got })
}
/// Create a new "unsupported empty pattern" error. This occurs when the
/// caller requests a search for which matching an automaton that contains
/// an empty pattern string is not supported.
pub fn unsupported_empty() -> MatchError {
MatchError::new(MatchErrorKind::UnsupportedEmpty)
}
}
/// The underlying kind of a [`MatchError`].
///
/// This is a **non-exhaustive** enum. That means new variants may be added in
/// a semver-compatible release.
#[non_exhaustive]
#[derive(Clone, Debug, Eq, PartialEq)]
pub enum MatchErrorKind {
/// An error indicating that an anchored search was requested, but from a
/// searcher that was built without anchored support.
InvalidInputAnchored,
/// An error indicating that an unanchored search was requested, but from a
/// searcher that was built without unanchored support.
InvalidInputUnanchored,
/// An error indicating that a stream search was attempted on an
/// Aho-Corasick automaton with an unsupported `MatchKind`.
UnsupportedStream {
/// The match semantics for the automaton that was used.
got: MatchKind,
},
/// An error indicating that an overlapping search was attempted on an
/// Aho-Corasick automaton with an unsupported `MatchKind`.
UnsupportedOverlapping {
/// The match semantics for the automaton that was used.
got: MatchKind,
},
/// An error indicating that the operation requested doesn't support
/// automatons that contain an empty pattern string.
UnsupportedEmpty,
}
#[cfg(feature = "std")]
impl std::error::Error for MatchError {}
impl core::fmt::Display for MatchError {
fn fmt(&self, f: &mut core::fmt::Formatter) -> core::fmt::Result {
match *self.kind() {
MatchErrorKind::InvalidInputAnchored => {
write!(f, "anchored searches are not supported or enabled")
}
MatchErrorKind::InvalidInputUnanchored => {
write!(f, "unanchored searches are not supported or enabled")
}
MatchErrorKind::UnsupportedStream { got } => {
write!(
f,
"match kind {:?} does not support stream searching",
got,
)
}
MatchErrorKind::UnsupportedOverlapping { got } => {
write!(
f,
"match kind {:?} does not support overlapping searches",
got,
)
}
MatchErrorKind::UnsupportedEmpty => {
write!(
f,
"matching with an empty pattern string is not \
supported for this operation",
)
}
}
}
}
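Since MatchError and MatchErrorKind are public, downstream code can branch on the kind. A hypothetical helper (the name describe is illustrative; it assumes both types are in scope):

// Map a search error to a short description. MatchErrorKind is
// non-exhaustive, so downstream matches need a wildcard arm.
fn describe(err: &MatchError) -> &'static str {
    match err.kind() {
        MatchErrorKind::InvalidInputAnchored => "anchored searches not supported",
        MatchErrorKind::InvalidInputUnanchored => "unanchored searches not supported",
        MatchErrorKind::UnsupportedStream { .. } => "stream searches not supported",
        MatchErrorKind::UnsupportedOverlapping { .. } => "overlapping searches not supported",
        MatchErrorKind::UnsupportedEmpty => "empty patterns not supported",
        _ => "unknown search error",
    }
}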

284
vendor/aho-corasick/src/util/int.rs vendored Normal file
View File

@ -0,0 +1,284 @@
/*!
This module provides several integer oriented traits for converting between
both fixed size integers and integers whose size varies based on the target
(like `usize`).
The main design principle for this module is to centralize all uses of `as`.
The thinking here is that `as` makes it very easy to perform accidental lossy
conversions, and if we centralize all its uses here under more descriptive
higher level operations, its use and correctness becomes easier to audit.
This was copied mostly wholesale from `regex-automata`.
NOTE: for simplicity, we don't take target pointer width into account here for
`usize` conversions. Since we currently only panic in debug mode, skipping the
check when it can be proven it isn't needed at compile time doesn't really
matter. Now, if we wind up wanting to do as many checks as possible in release
mode, then we would want to skip those when we know the conversions are always
non-lossy.
*/
pub(crate) trait U8 {
fn as_usize(self) -> usize;
}
impl U8 for u8 {
fn as_usize(self) -> usize {
usize::from(self)
}
}
pub(crate) trait U16 {
fn as_usize(self) -> usize;
fn low_u8(self) -> u8;
fn high_u8(self) -> u8;
}
impl U16 for u16 {
fn as_usize(self) -> usize {
usize::from(self)
}
fn low_u8(self) -> u8 {
self as u8
}
fn high_u8(self) -> u8 {
(self >> 8) as u8
}
}
pub(crate) trait U32 {
fn as_usize(self) -> usize;
fn low_u8(self) -> u8;
fn low_u16(self) -> u16;
fn high_u16(self) -> u16;
}
impl U32 for u32 {
#[inline]
fn as_usize(self) -> usize {
#[cfg(debug_assertions)]
{
usize::try_from(self).expect("u32 overflowed usize")
}
#[cfg(not(debug_assertions))]
{
self as usize
}
}
fn low_u8(self) -> u8 {
self as u8
}
fn low_u16(self) -> u16 {
self as u16
}
fn high_u16(self) -> u16 {
(self >> 16) as u16
}
}
pub(crate) trait U64 {
fn as_usize(self) -> usize;
fn low_u8(self) -> u8;
fn low_u16(self) -> u16;
fn low_u32(self) -> u32;
fn high_u32(self) -> u32;
}
impl U64 for u64 {
fn as_usize(self) -> usize {
#[cfg(debug_assertions)]
{
usize::try_from(self).expect("u64 overflowed usize")
}
#[cfg(not(debug_assertions))]
{
self as usize
}
}
fn low_u8(self) -> u8 {
self as u8
}
fn low_u16(self) -> u16 {
self as u16
}
fn low_u32(self) -> u32 {
self as u32
}
fn high_u32(self) -> u32 {
(self >> 32) as u32
}
}
pub(crate) trait I8 {
fn as_usize(self) -> usize;
fn to_bits(self) -> u8;
fn from_bits(n: u8) -> i8;
}
impl I8 for i8 {
fn as_usize(self) -> usize {
#[cfg(debug_assertions)]
{
usize::try_from(self).expect("i8 overflowed usize")
}
#[cfg(not(debug_assertions))]
{
self as usize
}
}
fn to_bits(self) -> u8 {
self as u8
}
fn from_bits(n: u8) -> i8 {
n as i8
}
}
pub(crate) trait I32 {
fn as_usize(self) -> usize;
fn to_bits(self) -> u32;
fn from_bits(n: u32) -> i32;
}
impl I32 for i32 {
fn as_usize(self) -> usize {
#[cfg(debug_assertions)]
{
usize::try_from(self).expect("i32 overflowed usize")
}
#[cfg(not(debug_assertions))]
{
self as usize
}
}
fn to_bits(self) -> u32 {
self as u32
}
fn from_bits(n: u32) -> i32 {
n as i32
}
}
pub(crate) trait I64 {
fn as_usize(self) -> usize;
fn to_bits(self) -> u64;
fn from_bits(n: u64) -> i64;
}
impl I64 for i64 {
fn as_usize(self) -> usize {
#[cfg(debug_assertions)]
{
usize::try_from(self).expect("i64 overflowed usize")
}
#[cfg(not(debug_assertions))]
{
self as usize
}
}
fn to_bits(self) -> u64 {
self as u64
}
fn from_bits(n: u64) -> i64 {
n as i64
}
}
pub(crate) trait Usize {
fn as_u8(self) -> u8;
fn as_u16(self) -> u16;
fn as_u32(self) -> u32;
fn as_u64(self) -> u64;
}
impl Usize for usize {
fn as_u8(self) -> u8 {
#[cfg(debug_assertions)]
{
u8::try_from(self).expect("usize overflowed u8")
}
#[cfg(not(debug_assertions))]
{
self as u8
}
}
fn as_u16(self) -> u16 {
#[cfg(debug_assertions)]
{
u16::try_from(self).expect("usize overflowed u16")
}
#[cfg(not(debug_assertions))]
{
self as u16
}
}
fn as_u32(self) -> u32 {
#[cfg(debug_assertions)]
{
u32::try_from(self).expect("usize overflowed u32")
}
#[cfg(not(debug_assertions))]
{
self as u32
}
}
fn as_u64(self) -> u64 {
#[cfg(debug_assertions)]
{
u64::try_from(self).expect("usize overflowed u64")
}
#[cfg(not(debug_assertions))]
{
self as u64
}
}
}
// Pointers aren't integers, but we convert pointers to integers to perform
// offset arithmetic in some places. (And no, we don't convert the integers
// back to pointers.) So add 'as_usize' conversions here too for completeness.
//
// These 'as' casts are actually okay because they're always non-lossy. But the
// idea here is to just try and remove as much 'as' as possible, particularly
// in this crate where we are being really paranoid about offsets and making
// sure we don't panic on inputs that might be untrusted. This way, the 'as'
// casts become easier to audit if they're all in one place, even when some of
// them are actually okay 100% of the time.
pub(crate) trait Pointer {
fn as_usize(self) -> usize;
}
impl<T> Pointer for *const T {
fn as_usize(self) -> usize {
self as usize
}
}
pub(crate) trait PointerMut {
fn as_usize(self) -> usize;
}
impl<T> PointerMut for *mut T {
fn as_usize(self) -> usize {
self as usize
}
}
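These traits are crate-internal, so the sketch below is hypothetical and only illustrates the intended style: conversions go through the trait methods, which panic in debug builds on loss, rather than through bare `as` casts.

use crate::util::int::{Usize, U32};

// Hypothetical: pair up a length and an identifier, checking both in debug
// builds.
fn pack(len: usize, id: u32) -> (u32, usize) {
    // `len.as_u32()` panics in debug builds if `len` does not fit in a u32,
    // unlike `len as u32`, which silently truncates.
    (len.as_u32(), id.as_usize())
}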

12
vendor/aho-corasick/src/util/mod.rs vendored Normal file
View File

@ -0,0 +1,12 @@
pub(crate) mod alphabet;
#[cfg(feature = "std")]
pub(crate) mod buffer;
pub(crate) mod byte_frequencies;
pub(crate) mod debug;
pub(crate) mod error;
pub(crate) mod int;
pub(crate) mod prefilter;
pub(crate) mod primitives;
pub(crate) mod remapper;
pub(crate) mod search;
pub(crate) mod special;

924
vendor/aho-corasick/src/util/prefilter.rs vendored Normal file
View File

@ -0,0 +1,924 @@
use core::{
cmp,
fmt::Debug,
panic::{RefUnwindSafe, UnwindSafe},
u8,
};
use alloc::{sync::Arc, vec, vec::Vec};
use crate::{
packed,
util::{
alphabet::ByteSet,
search::{Match, MatchKind, Span},
},
};
/// A prefilter for accelerating a search.
///
/// This crate uses prefilters in the core search implementations to accelerate
/// common cases. They typically only apply to cases where there are a small
/// number of patterns (less than 100 or so), but when they do, thoughput can
/// be boosted considerably, perhaps by an order of magnitude. When a prefilter
/// is active, it is used whenever a search enters an automaton's start state.
///
/// Currently, prefilters cannot be constructed by
/// callers. A `Prefilter` can only be accessed via the
/// [`Automaton::prefilter`](crate::automaton::Automaton::prefilter)
/// method and used to execute a search. In other words, a prefilter can be
/// used to optimize your own search implementation if necessary, but cannot do
/// much else. If you have a use case for more APIs, please submit an issue.
#[derive(Clone, Debug)]
pub struct Prefilter {
finder: Arc<dyn PrefilterI>,
memory_usage: usize,
}
impl Prefilter {
/// Execute a search in the haystack within the span given. If a match or
/// a possible match is returned, then it is guaranteed to occur within
/// the bounds of the span.
///
/// If the span provided is invalid for the given haystack, then behavior
/// is unspecified.
#[inline]
pub fn find_in(&self, haystack: &[u8], span: Span) -> Candidate {
self.finder.find_in(haystack, span)
}
#[inline]
pub(crate) fn memory_usage(&self) -> usize {
self.memory_usage
}
}
/// A candidate is the result of running a prefilter on a haystack at a
/// particular position.
///
/// The result is either no match, a confirmed match or a possible match.
///
/// When no match is returned, the prefilter is guaranteeing that no possible
/// match can be found in the haystack, and the caller may trust this. That is,
/// all correct prefilters must never report false negatives.
///
/// In some cases, a prefilter can confirm a match very quickly, in which case,
/// the caller may use this to stop what it's doing and report the match. In
/// this case, prefilter implementations must never report a false positive.
/// In other cases, the prefilter can only report a potential match, in which
/// case the callers must attempt to confirm the match. In this case, prefilter
/// implementations are permitted to return false positives.
#[derive(Clone, Debug)]
pub enum Candidate {
/// No match was found. Since false negatives are not possible, this means
/// the search can quit as it is guaranteed not to find another match.
None,
/// A confirmed match was found. Callers do not need to confirm it.
Match(Match),
/// The start of a possible match was found. Callers must confirm it before
/// reporting it as a match.
PossibleStartOfMatch(usize),
}
impl Candidate {
/// Convert this candidate into an option. This is useful when callers
/// do not distinguish between true positives and false positives (i.e.,
/// the caller must always confirm the match).
pub fn into_option(self) -> Option<usize> {
match self {
Candidate::None => None,
Candidate::Match(ref m) => Some(m.start()),
Candidate::PossibleStartOfMatch(start) => Some(start),
}
}
}
/// A prefilter describes the behavior of fast literal scanners for quickly
/// skipping past bytes in the haystack that we know cannot possibly
/// participate in a match.
trait PrefilterI:
Send + Sync + RefUnwindSafe + UnwindSafe + Debug + 'static
{
/// Returns the next possible match candidate. This may yield false
/// positives, so callers must confirm a match starting at the position
/// returned. This, however, must never produce false negatives. That is,
/// this must, at minimum, return the starting position of the next match
/// in the given haystack after or at the given position.
fn find_in(&self, haystack: &[u8], span: Span) -> Candidate;
}
impl<P: PrefilterI + ?Sized> PrefilterI for Arc<P> {
#[inline(always)]
fn find_in(&self, haystack: &[u8], span: Span) -> Candidate {
(**self).find_in(haystack, span)
}
}
/// A builder for constructing the best possible prefilter. When constructed,
/// this builder will heuristically select the best prefilter it can build,
/// if any, and discard the rest.
#[derive(Debug)]
pub(crate) struct Builder {
count: usize,
ascii_case_insensitive: bool,
start_bytes: StartBytesBuilder,
rare_bytes: RareBytesBuilder,
memmem: MemmemBuilder,
packed: Option<packed::Builder>,
// If we run across a condition that suggests we shouldn't use a prefilter
// at all (like an empty pattern), then disable prefilters entirely.
enabled: bool,
}
impl Builder {
/// Create a new builder for constructing the best possible prefilter.
pub(crate) fn new(kind: MatchKind) -> Builder {
let pbuilder = kind
.as_packed()
.map(|kind| packed::Config::new().match_kind(kind).builder());
Builder {
count: 0,
ascii_case_insensitive: false,
start_bytes: StartBytesBuilder::new(),
rare_bytes: RareBytesBuilder::new(),
memmem: MemmemBuilder::default(),
packed: pbuilder,
enabled: true,
}
}
/// Enable ASCII case insensitivity. When set, byte strings added to this
/// builder will be interpreted without respect to ASCII case.
pub(crate) fn ascii_case_insensitive(mut self, yes: bool) -> Builder {
self.ascii_case_insensitive = yes;
self.start_bytes = self.start_bytes.ascii_case_insensitive(yes);
self.rare_bytes = self.rare_bytes.ascii_case_insensitive(yes);
self
}
/// Return a prefilter suitable for quickly finding potential matches.
///
/// All patterns added to an Aho-Corasick automaton should be added to this
/// builder before attempting to construct the prefilter.
pub(crate) fn build(&self) -> Option<Prefilter> {
if !self.enabled {
debug!("prefilter not enabled, skipping");
return None;
}
// If we only have one pattern, then deferring to memmem is always
// the best choice. This is kind of a weird case, because, well, why
// use Aho-Corasick if you only have one pattern? But maybe you don't
// know exactly how many patterns you'll get up front, and you need to
// support the option of multiple patterns. So instead of relying on
// the caller to branch and use memmem explicitly, we just do it for
// them.
if !self.ascii_case_insensitive {
if let Some(pre) = self.memmem.build() {
debug!("using memmem prefilter");
return Some(pre);
}
}
let (packed, patlen, minlen) = if self.ascii_case_insensitive {
(None, usize::MAX, 0)
} else {
let patlen = self.packed.as_ref().map_or(usize::MAX, |p| p.len());
let minlen = self.packed.as_ref().map_or(0, |p| p.minimum_len());
let packed =
self.packed.as_ref().and_then(|b| b.build()).map(|s| {
let memory_usage = s.memory_usage();
debug!(
"built packed prefilter (len: {}, \
minimum pattern len: {}, memory usage: {}) \
for consideration",
patlen, minlen, memory_usage,
);
Prefilter { finder: Arc::new(Packed(s)), memory_usage }
});
(packed, patlen, minlen)
};
match (self.start_bytes.build(), self.rare_bytes.build()) {
// If we could build both start and rare prefilters, then there are
// a few cases in which we'd want to use the start-byte prefilter
// over the rare-byte prefilter, since the former has lower
// overhead.
(prestart @ Some(_), prerare @ Some(_)) => {
debug!(
"both start (len={}, rank={}) and \
rare (len={}, rank={}) byte prefilters \
are available",
self.start_bytes.count,
self.start_bytes.rank_sum,
self.rare_bytes.count,
self.rare_bytes.rank_sum,
);
if patlen <= 16
&& minlen >= 2
&& self.start_bytes.count >= 3
&& self.rare_bytes.count >= 3
{
debug!(
"start and rare byte prefilters available, but \
they're probably slower than packed so using \
packed"
);
return packed;
}
// If the start-byte prefilter can scan for a smaller number
// of bytes than the rare-byte prefilter, then it's probably
// faster.
let has_fewer_bytes =
self.start_bytes.count < self.rare_bytes.count;
// Otherwise, if the combined frequency rank of the detected
// bytes in the start-byte prefilter is "close" to the combined
// frequency rank of the rare-byte prefilter, then we pick
// the start-byte prefilter even if the rare-byte prefilter
// heuristically searches for rare bytes. This is because the
// rare-byte prefilter has higher constant costs, so we tend to
// prefer the start-byte prefilter when we can.
let has_rarer_bytes =
self.start_bytes.rank_sum <= self.rare_bytes.rank_sum + 50;
if has_fewer_bytes {
debug!(
"using start byte prefilter because it has fewer
bytes to search for than the rare byte prefilter",
);
prestart
} else if has_rarer_bytes {
debug!(
"using start byte prefilter because its byte \
frequency rank was determined to be \
\"good enough\" relative to the rare byte prefilter \
byte frequency rank",
);
prestart
} else {
debug!("using rare byte prefilter");
prerare
}
}
(prestart @ Some(_), None) => {
if patlen <= 16 && minlen >= 2 && self.start_bytes.count >= 3 {
debug!(
"start byte prefilter available, but \
it's probably slower than packed so using \
packed"
);
return packed;
}
debug!(
"have start byte prefilter but not rare byte prefilter, \
so using start byte prefilter",
);
prestart
}
(None, prerare @ Some(_)) => {
if patlen <= 16 && minlen >= 2 && self.rare_bytes.count >= 3 {
debug!(
"rare byte prefilter available, but \
it's probably slower than packed so using \
packed"
);
return packed;
}
debug!(
"have rare byte prefilter but not start byte prefilter, \
so using rare byte prefilter",
);
prerare
}
(None, None) if self.ascii_case_insensitive => {
debug!(
"no start or rare byte prefilter and ASCII case \
insensitivity was enabled, so skipping prefilter",
);
None
}
(None, None) => {
if packed.is_some() {
debug!("falling back to packed prefilter");
} else {
debug!("no prefilter available");
}
packed
}
}
}
/// Add a literal string to this prefilter builder.
pub(crate) fn add(&mut self, bytes: &[u8]) {
if bytes.is_empty() {
self.enabled = false;
}
if !self.enabled {
return;
}
self.count += 1;
self.start_bytes.add(bytes);
self.rare_bytes.add(bytes);
self.memmem.add(bytes);
if let Some(ref mut pbuilder) = self.packed {
pbuilder.add(bytes);
}
}
}
/// A type that wraps a packed searcher and implements the `Prefilter`
/// interface.
#[derive(Clone, Debug)]
struct Packed(packed::Searcher);
impl PrefilterI for Packed {
fn find_in(&self, haystack: &[u8], span: Span) -> Candidate {
self.0
.find_in(&haystack, span)
.map_or(Candidate::None, Candidate::Match)
}
}
/// A builder for constructing a prefilter that uses memmem.
#[derive(Debug, Default)]
struct MemmemBuilder {
/// The number of patterns that have been added.
count: usize,
/// The singular pattern to search for. This is only set when count==1.
one: Option<Vec<u8>>,
}
impl MemmemBuilder {
fn build(&self) -> Option<Prefilter> {
#[cfg(all(feature = "std", feature = "perf-literal"))]
fn imp(builder: &MemmemBuilder) -> Option<Prefilter> {
let pattern = builder.one.as_ref()?;
assert_eq!(1, builder.count);
let finder = Arc::new(Memmem(
memchr::memmem::Finder::new(pattern).into_owned(),
));
let memory_usage = pattern.len();
Some(Prefilter { finder, memory_usage })
}
#[cfg(not(all(feature = "std", feature = "perf-literal")))]
fn imp(_: &MemmemBuilder) -> Option<Prefilter> {
None
}
imp(self)
}
fn add(&mut self, bytes: &[u8]) {
self.count += 1;
if self.count == 1 {
self.one = Some(bytes.to_vec());
} else {
self.one = None;
}
}
}
/// A type that wraps a SIMD accelerated single substring search from the
/// `memchr` crate for use as a prefilter.
///
/// Currently, this prefilter is only active for Aho-Corasick searchers with
/// a single pattern. In theory, this could be extended to support searchers
/// that have a common prefix of more than one byte (for one byte, we would use
/// memchr), but it's not clear if it's worth it or not.
///
/// Also, unfortunately, this currently also requires the 'std' feature to
/// be enabled. That's because memchr doesn't have a no-std-but-with-alloc
/// mode, and so APIs like Finder::into_owned aren't available when 'std' is
/// disabled. But there should be an 'alloc' feature that brings in APIs like
/// Finder::into_owned but doesn't use std-only features like runtime CPU
/// feature detection.
#[cfg(all(feature = "std", feature = "perf-literal"))]
#[derive(Clone, Debug)]
struct Memmem(memchr::memmem::Finder<'static>);
#[cfg(all(feature = "std", feature = "perf-literal"))]
impl PrefilterI for Memmem {
fn find_in(&self, haystack: &[u8], span: Span) -> Candidate {
use crate::util::primitives::PatternID;
self.0.find(&haystack[span]).map_or(Candidate::None, |i| {
let start = span.start + i;
let end = start + self.0.needle().len();
// N.B. We can declare a match and use a fixed pattern ID here
// because a Memmem prefilter is only ever created for searchers
// with exactly one pattern. Thus, every match is always a match
// and it is always for the first and only pattern.
Candidate::Match(Match::new(PatternID::ZERO, start..end))
})
}
}
/// A builder for constructing a rare byte prefilter.
///
/// A rare byte prefilter attempts to pick out a small set of rare bytes that
/// occur in the patterns, and then quickly scans for occurrences of those
/// rare bytes.
#[derive(Clone, Debug)]
struct RareBytesBuilder {
/// Whether this prefilter should account for ASCII case insensitivity or
/// not.
ascii_case_insensitive: bool,
/// A set of rare bytes, indexed by byte value.
rare_set: ByteSet,
/// A set of byte offsets associated with bytes in a pattern. An entry
/// corresponds to a particular byte (its index) and is only non-zero if
/// the byte occurred at an offset greater than 0 in at least one pattern.
///
/// If a byte's offset is not representable in 8 bits, then the rare bytes
/// prefilter becomes inert.
byte_offsets: RareByteOffsets,
/// Whether this is available as a prefilter or not. This can be set to
/// false during construction if a condition is seen that invalidates the
/// use of the rare-byte prefilter.
available: bool,
/// The number of bytes set to an active value in `byte_offsets`.
count: usize,
/// The sum of frequency ranks for the rare bytes detected. This is
/// intended to give a heuristic notion of how rare the bytes are.
rank_sum: u16,
}
/// A set of byte offsets, keyed by byte.
#[derive(Clone, Copy)]
struct RareByteOffsets {
/// Each entry corresponds to the maximum offset of the corresponding
/// byte across all patterns seen.
set: [RareByteOffset; 256],
}
impl RareByteOffsets {
/// Create a new empty set of rare byte offsets.
pub(crate) fn empty() -> RareByteOffsets {
RareByteOffsets { set: [RareByteOffset::default(); 256] }
}
/// Add the given offset for the given byte to this set. The offset recorded
/// for the byte is the maximum of its existing offset and the offset given.
pub(crate) fn set(&mut self, byte: u8, off: RareByteOffset) {
self.set[byte as usize].max =
cmp::max(self.set[byte as usize].max, off.max);
}
}
impl core::fmt::Debug for RareByteOffsets {
fn fmt(&self, f: &mut core::fmt::Formatter<'_>) -> core::fmt::Result {
let mut offsets = vec![];
for off in self.set.iter() {
if off.max > 0 {
offsets.push(off);
}
}
f.debug_struct("RareByteOffsets").field("set", &offsets).finish()
}
}
/// Offsets associated with an occurrence of a "rare" byte in any of the
/// patterns used to construct a single Aho-Corasick automaton.
#[derive(Clone, Copy, Debug)]
struct RareByteOffset {
/// The maximum offset at which a particular byte occurs from the start
/// of any pattern. This is used as a shift amount. That is, when an
/// occurrence of this byte is found, the candidate position reported by
/// the prefilter is `position_of_byte - max`, such that the automaton
/// will begin its search at a position that is guaranteed to observe a
/// match.
///
/// To avoid accidentally quadratic behavior, a prefilter is considered
/// ineffective when it is asked to start scanning from a position that it
/// has already scanned past.
///
/// Using a `u8` here means that if we ever see a pattern that's longer
/// than 255 bytes, then the entire rare byte prefilter is disabled.
max: u8,
}
impl Default for RareByteOffset {
fn default() -> RareByteOffset {
RareByteOffset { max: 0 }
}
}
impl RareByteOffset {
/// Create a new rare byte offset. If the given offset is too big, then
/// None is returned. In that case, callers should render the rare bytes
/// prefilter inert.
fn new(max: usize) -> Option<RareByteOffset> {
if max > u8::MAX as usize {
None
} else {
Some(RareByteOffset { max: max as u8 })
}
}
}
impl RareBytesBuilder {
/// Create a new builder for constructing a rare byte prefilter.
fn new() -> RareBytesBuilder {
RareBytesBuilder {
ascii_case_insensitive: false,
rare_set: ByteSet::empty(),
byte_offsets: RareByteOffsets::empty(),
available: true,
count: 0,
rank_sum: 0,
}
}
/// Enable ASCII case insensitivity. When set, byte strings added to this
/// builder will be interpreted without respect to ASCII case.
fn ascii_case_insensitive(mut self, yes: bool) -> RareBytesBuilder {
self.ascii_case_insensitive = yes;
self
}
/// Build the rare bytes prefilter.
///
/// If there are more than 3 distinct rare bytes found, or if heuristics
/// otherwise determine that this prefilter should not be used, then `None`
/// is returned.
fn build(&self) -> Option<Prefilter> {
#[cfg(feature = "perf-literal")]
fn imp(builder: &RareBytesBuilder) -> Option<Prefilter> {
if !builder.available || builder.count > 3 {
return None;
}
let (mut bytes, mut len) = ([0; 3], 0);
for b in 0..=255 {
if builder.rare_set.contains(b) {
bytes[len] = b as u8;
len += 1;
}
}
let finder: Arc<dyn PrefilterI> = match len {
0 => return None,
1 => Arc::new(RareBytesOne {
byte1: bytes[0],
offset: builder.byte_offsets.set[bytes[0] as usize],
}),
2 => Arc::new(RareBytesTwo {
offsets: builder.byte_offsets,
byte1: bytes[0],
byte2: bytes[1],
}),
3 => Arc::new(RareBytesThree {
offsets: builder.byte_offsets,
byte1: bytes[0],
byte2: bytes[1],
byte3: bytes[2],
}),
_ => unreachable!(),
};
Some(Prefilter { finder, memory_usage: 0 })
}
#[cfg(not(feature = "perf-literal"))]
fn imp(_: &RareBytesBuilder) -> Option<Prefilter> {
None
}
imp(self)
}
/// Add a byte string to this builder.
///
/// All patterns added to an Aho-Corasick automaton should be added to this
/// builder before attempting to construct the prefilter.
fn add(&mut self, bytes: &[u8]) {
// If we've already given up, then do nothing.
if !self.available {
return;
}
// If we've already blown our budget, then don't waste time looking
// for more rare bytes.
if self.count > 3 {
self.available = false;
return;
}
// If the pattern is too long, then our offset table is bunk, so
// give up.
if bytes.len() >= 256 {
self.available = false;
return;
}
let mut rarest = match bytes.get(0) {
None => return,
Some(&b) => (b, freq_rank(b)),
};
// The idea here is to look for the rarest byte in each pattern, and
// add that to our set. As a special exception, if we see a byte that
// we've already added, then we immediately stop and choose that byte,
// even if there's another rare byte in the pattern. This helps us
// apply the rare byte optimization in more cases by attempting to pick
// bytes that are in common between patterns. So for example, if we
// were searching for `Sherlock` and `lockjaw`, then this would pick
// `k` for both patterns, resulting in the use of `memchr` instead of
// `memchr2` for `k` and `j`.
let mut found = false;
for (pos, &b) in bytes.iter().enumerate() {
self.set_offset(pos, b);
if found {
continue;
}
if self.rare_set.contains(b) {
found = true;
continue;
}
let rank = freq_rank(b);
if rank < rarest.1 {
rarest = (b, rank);
}
}
if !found {
self.add_rare_byte(rarest.0);
}
}
fn set_offset(&mut self, pos: usize, byte: u8) {
// This unwrap is OK because pos is never bigger than our max.
let offset = RareByteOffset::new(pos).unwrap();
self.byte_offsets.set(byte, offset);
if self.ascii_case_insensitive {
self.byte_offsets.set(opposite_ascii_case(byte), offset);
}
}
fn add_rare_byte(&mut self, byte: u8) {
self.add_one_rare_byte(byte);
if self.ascii_case_insensitive {
self.add_one_rare_byte(opposite_ascii_case(byte));
}
}
fn add_one_rare_byte(&mut self, byte: u8) {
if !self.rare_set.contains(byte) {
self.rare_set.add(byte);
self.count += 1;
self.rank_sum += freq_rank(byte) as u16;
}
}
}
/// A prefilter for scanning for a single "rare" byte.
#[cfg(feature = "perf-literal")]
#[derive(Clone, Debug)]
struct RareBytesOne {
byte1: u8,
offset: RareByteOffset,
}
#[cfg(feature = "perf-literal")]
impl PrefilterI for RareBytesOne {
fn find_in(&self, haystack: &[u8], span: Span) -> Candidate {
memchr::memchr(self.byte1, &haystack[span])
.map(|i| {
let pos = span.start + i;
cmp::max(
span.start,
pos.saturating_sub(usize::from(self.offset.max)),
)
})
.map_or(Candidate::None, Candidate::PossibleStartOfMatch)
}
}
/// A prefilter for scanning for two "rare" bytes.
#[cfg(feature = "perf-literal")]
#[derive(Clone, Debug)]
struct RareBytesTwo {
offsets: RareByteOffsets,
byte1: u8,
byte2: u8,
}
#[cfg(feature = "perf-literal")]
impl PrefilterI for RareBytesTwo {
fn find_in(&self, haystack: &[u8], span: Span) -> Candidate {
memchr::memchr2(self.byte1, self.byte2, &haystack[span])
.map(|i| {
let pos = span.start + i;
let offset = self.offsets.set[usize::from(haystack[pos])].max;
cmp::max(span.start, pos.saturating_sub(usize::from(offset)))
})
.map_or(Candidate::None, Candidate::PossibleStartOfMatch)
}
}
/// A prefilter for scanning for three "rare" bytes.
#[cfg(feature = "perf-literal")]
#[derive(Clone, Debug)]
struct RareBytesThree {
offsets: RareByteOffsets,
byte1: u8,
byte2: u8,
byte3: u8,
}
#[cfg(feature = "perf-literal")]
impl PrefilterI for RareBytesThree {
fn find_in(&self, haystack: &[u8], span: Span) -> Candidate {
memchr::memchr3(self.byte1, self.byte2, self.byte3, &haystack[span])
.map(|i| {
let pos = span.start + i;
let offset = self.offsets.set[usize::from(haystack[pos])].max;
cmp::max(span.start, pos.saturating_sub(usize::from(offset)))
})
.map_or(Candidate::None, Candidate::PossibleStartOfMatch)
}
}
/// A builder for constructing a starting byte prefilter.
///
/// A starting byte prefilter is a simplistic prefilter that looks for possible
/// matches by reporting all positions corresponding to a particular byte. This
/// generally only takes effect when there are at most 3 distinct possible
/// starting bytes. e.g., the patterns `foo`, `bar`, and `baz` have two
/// distinct starting bytes (`f` and `b`), and this prefilter returns all
/// occurrences of either `f` or `b`.
///
/// In some cases, a heuristic frequency analysis may determine that it would
/// be better not to use this prefilter even when there are 3 or fewer distinct
/// starting bytes.
#[derive(Clone, Debug)]
struct StartBytesBuilder {
/// Whether this prefilter should account for ASCII case insensitivity or
/// not.
ascii_case_insensitive: bool,
/// The set of starting bytes observed.
byteset: Vec<bool>,
/// The number of bytes set to true in `byteset`.
count: usize,
/// The sum of frequency ranks for the rare bytes detected. This is
/// intended to give a heuristic notion of how rare the bytes are.
rank_sum: u16,
}
impl StartBytesBuilder {
/// Create a new builder for constructing a start byte prefilter.
fn new() -> StartBytesBuilder {
StartBytesBuilder {
ascii_case_insensitive: false,
byteset: vec![false; 256],
count: 0,
rank_sum: 0,
}
}
/// Enable ASCII case insensitivity. When set, byte strings added to this
/// builder will be interpreted without respect to ASCII case.
fn ascii_case_insensitive(mut self, yes: bool) -> StartBytesBuilder {
self.ascii_case_insensitive = yes;
self
}
/// Build the starting bytes prefilter.
///
/// If there are more than 3 distinct starting bytes, or if heuristics
/// otherwise determine that this prefilter should not be used, then `None`
/// is returned.
fn build(&self) -> Option<Prefilter> {
#[cfg(feature = "perf-literal")]
fn imp(builder: &StartBytesBuilder) -> Option<Prefilter> {
if builder.count > 3 {
return None;
}
let (mut bytes, mut len) = ([0; 3], 0);
for b in 0..256 {
if !builder.byteset[b] {
continue;
}
// We don't handle non-ASCII bytes for now. Getting non-ASCII
// bytes right is trickier, since we generally don't want to put
// a leading UTF-8 code unit into a prefilter that isn't ASCII,
// since they can occur frequently. Instead, it would be better to use a
// continuation byte, but this requires more sophisticated analysis
// of the automaton and a richer prefilter API.
if b > 0x7F {
return None;
}
bytes[len] = b as u8;
len += 1;
}
let finder: Arc<dyn PrefilterI> = match len {
0 => return None,
1 => Arc::new(StartBytesOne { byte1: bytes[0] }),
2 => Arc::new(StartBytesTwo {
byte1: bytes[0],
byte2: bytes[1],
}),
3 => Arc::new(StartBytesThree {
byte1: bytes[0],
byte2: bytes[1],
byte3: bytes[2],
}),
_ => unreachable!(),
};
Some(Prefilter { finder, memory_usage: 0 })
}
#[cfg(not(feature = "perf-literal"))]
fn imp(_: &StartBytesBuilder) -> Option<Prefilter> {
None
}
imp(self)
}
/// Add a byte string to this builder.
///
/// All patterns added to an Aho-Corasick automaton should be added to this
/// builder before attempting to construct the prefilter.
fn add(&mut self, bytes: &[u8]) {
if self.count > 3 {
return;
}
if let Some(&byte) = bytes.get(0) {
self.add_one_byte(byte);
if self.ascii_case_insensitive {
self.add_one_byte(opposite_ascii_case(byte));
}
}
}
fn add_one_byte(&mut self, byte: u8) {
if !self.byteset[byte as usize] {
self.byteset[byte as usize] = true;
self.count += 1;
self.rank_sum += freq_rank(byte) as u16;
}
}
}
/// A prefilter for scanning for a single starting byte.
#[cfg(feature = "perf-literal")]
#[derive(Clone, Debug)]
struct StartBytesOne {
byte1: u8,
}
#[cfg(feature = "perf-literal")]
impl PrefilterI for StartBytesOne {
fn find_in(&self, haystack: &[u8], span: Span) -> Candidate {
memchr::memchr(self.byte1, &haystack[span])
.map(|i| span.start + i)
.map_or(Candidate::None, Candidate::PossibleStartOfMatch)
}
}
/// A prefilter for scanning for two starting bytes.
#[cfg(feature = "perf-literal")]
#[derive(Clone, Debug)]
struct StartBytesTwo {
byte1: u8,
byte2: u8,
}
#[cfg(feature = "perf-literal")]
impl PrefilterI for StartBytesTwo {
fn find_in(&self, haystack: &[u8], span: Span) -> Candidate {
memchr::memchr2(self.byte1, self.byte2, &haystack[span])
.map(|i| span.start + i)
.map_or(Candidate::None, Candidate::PossibleStartOfMatch)
}
}
/// A prefilter for scanning for three starting bytes.
#[cfg(feature = "perf-literal")]
#[derive(Clone, Debug)]
struct StartBytesThree {
byte1: u8,
byte2: u8,
byte3: u8,
}
#[cfg(feature = "perf-literal")]
impl PrefilterI for StartBytesThree {
fn find_in(&self, haystack: &[u8], span: Span) -> Candidate {
memchr::memchr3(self.byte1, self.byte2, self.byte3, &haystack[span])
.map(|i| span.start + i)
.map_or(Candidate::None, Candidate::PossibleStartOfMatch)
}
}
/// If the given byte is an ASCII letter, then return it in the opposite case.
/// e.g., Given `b'A'`, this returns `b'a'`, and given `b'a'`, this returns
/// `b'A'`. If a non-ASCII letter is given, then the given byte is returned.
pub(crate) fn opposite_ascii_case(b: u8) -> u8 {
if b'A' <= b && b <= b'Z' {
b.to_ascii_lowercase()
} else if b'a' <= b && b <= b'z' {
b.to_ascii_uppercase()
} else {
b
}
}
/// Return the frequency rank of the given byte. The higher the rank, the more
/// common the byte (heuristically speaking).
fn freq_rank(b: u8) -> u8 {
use crate::util::byte_frequencies::BYTE_FREQUENCIES;
BYTE_FREQUENCIES[b as usize]
}
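As a sketch of how a caller might drive a prefilter obtained from Automaton::prefilter, here is a hypothetical helper (the name next_candidate is illustrative; it assumes Prefilter, Candidate, Span and Match are in scope, and that Span's start/end fields are accessible, as their use elsewhere in this module suggests):

// Report the next plausible match position at or after `at`, or None if the
// prefilter guarantees there is no match in the remainder of the haystack.
// A position from PossibleStartOfMatch must still be confirmed by the
// automaton; a position from Match need not be.
fn next_candidate(pre: &Prefilter, haystack: &[u8], at: usize) -> Option<usize> {
    let span = Span { start: at, end: haystack.len() };
    match pre.find_in(haystack, span) {
        Candidate::None => None,
        Candidate::Match(m) => Some(m.start()),
        Candidate::PossibleStartOfMatch(start) => Some(start),
    }
}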

759
vendor/aho-corasick/src/util/primitives.rs vendored Normal file
View File

@ -0,0 +1,759 @@
/*!
Lower level primitive types that are useful in a variety of circumstances.
# Overview
This list represents the principle types in this module and briefly describes
when you might want to use them.
* [`PatternID`] - A type that represents the identifier of a regex pattern.
This is probably the most widely used type in this module (which is why it's
also re-exported in the crate root).
/// * [`StateID`] - A type that represents the identifier of a finite automaton
state. This is used for both NFAs and DFAs, with the notable exception of
the hybrid NFA/DFA. (The hybrid NFA/DFA uses a special purpose "lazy" state
identifier.)
* [`SmallIndex`] - The internal representation of both a `PatternID` and a
`StateID`. Its purpose is to serve as a type that can index memory without
being as big as a `usize` on 64-bit targets. The main idea behind this type
is that there are many things in regex engines that will, in practice, never
overflow a 32-bit integer. (For example, like the number of patterns in a regex
or the number of states in an NFA.) Thus, a `SmallIndex` can be used to index
memory without peppering `as` casts everywhere. Moreover, it forces callers
to handle errors in the case where, somehow, the value would otherwise overflow
either a 32-bit integer or a `usize` (e.g., on 16-bit targets).
*/
// The macro we use to define some types below adds methods that we don't
// use on some of the types. There isn't much, so we just squash the warning.
#![allow(dead_code)]
use alloc::vec::Vec;
use crate::util::int::{Usize, U16, U32, U64};
/// A type that represents a "small" index.
///
/// The main idea of this type is to provide something that can index memory,
/// but uses less memory than `usize` on 64-bit systems. Specifically, its
/// representation is always a `u32` and has `repr(transparent)` enabled. (So
/// it is safe to transmute between a `u32` and a `SmallIndex`.)
///
/// A small index is typically useful in cases where there is no practical way
/// that the index will overflow a 32-bit integer. A good example of this is
/// an NFA state. If you could somehow build an NFA with `2^30` states, its
/// memory usage would be exorbitant and its runtime execution would be so
/// slow as to be completely worthless. Therefore, this crate generally deems
/// it acceptable to return an error if it would otherwise build an NFA that
/// requires a slice longer than what a 32-bit integer can index. In exchange,
/// we can use 32-bit indices instead of 64-bit indices in various places.
///
/// This type ensures this by providing a constructor that will return an error
/// if its argument cannot fit into the type. This makes it much easier to
/// handle these sorts of boundary cases that are otherwise extremely subtle.
///
/// On all targets, this type guarantees that its value will fit in a `u32`,
/// `i32`, `usize` and an `isize`. This means that on 16-bit targets, for
/// example, this type's maximum value will never overflow an `isize`,
/// which means it will never overflow a `i16` even though its internal
/// representation is still a `u32`.
///
/// The purpose for making the type fit into even signed integer types like
/// `isize` is to guarantee that the difference between any two small indices
/// is itself also a small index. This is useful in certain contexts, e.g.,
/// for delta encoding.
///
/// # Other types
///
/// The following types wrap `SmallIndex` to provide a more focused use case:
///
/// * [`PatternID`] is for representing the identifiers of patterns.
/// * [`StateID`] is for representing the identifiers of states in finite
/// automata. It is used for both NFAs and DFAs.
///
/// # Representation
///
/// This type is always represented internally by a `u32` and is marked as
/// `repr(transparent)`. Thus, this type always has the same representation as
/// a `u32`. It is thus safe to transmute between a `u32` and a `SmallIndex`.
///
/// # Indexing
///
/// For convenience, callers may use a `SmallIndex` to index slices.
///
/// # Safety
///
/// While a `SmallIndex` is meant to guarantee that its value fits into `usize`
/// without using as much space as a `usize` on all targets, callers must
/// not rely on this property for safety. Callers may choose to rely on this
/// property for correctness however. For example, creating a `SmallIndex` with
/// an invalid value can be done in entirely safe code. This may in turn result
/// in panics or silent logical errors.
#[derive(
Clone, Copy, Debug, Default, Eq, Hash, PartialEq, PartialOrd, Ord,
)]
#[repr(transparent)]
pub(crate) struct SmallIndex(u32);
impl SmallIndex {
/// The maximum index value.
#[cfg(any(target_pointer_width = "32", target_pointer_width = "64"))]
pub const MAX: SmallIndex =
// FIXME: Use as_usize() once const functions in traits are stable.
SmallIndex::new_unchecked(core::i32::MAX as usize - 1);
/// The maximum index value.
#[cfg(target_pointer_width = "16")]
pub const MAX: SmallIndex =
        SmallIndex::new_unchecked(core::isize::MAX as usize - 1);
/// The total number of values that can be represented as a small index.
pub const LIMIT: usize = SmallIndex::MAX.as_usize() + 1;
/// The zero index value.
pub const ZERO: SmallIndex = SmallIndex::new_unchecked(0);
/// The number of bytes that a single small index uses in memory.
pub const SIZE: usize = core::mem::size_of::<SmallIndex>();
/// Create a new small index.
///
/// If the given index exceeds [`SmallIndex::MAX`], then this returns
/// an error.
#[inline]
pub fn new(index: usize) -> Result<SmallIndex, SmallIndexError> {
SmallIndex::try_from(index)
}
/// Create a new small index without checking whether the given value
/// exceeds [`SmallIndex::MAX`].
///
/// Using this routine with an invalid index value will result in
/// unspecified behavior, but *not* undefined behavior. In particular, an
/// invalid index value is likely to cause panics or possibly even silent
/// logical errors.
///
/// Callers must never rely on a `SmallIndex` to be within a certain range
/// for memory safety.
#[inline]
pub const fn new_unchecked(index: usize) -> SmallIndex {
// FIXME: Use as_u32() once const functions in traits are stable.
SmallIndex::from_u32_unchecked(index as u32)
}
/// Create a new small index from a `u32` without checking whether the
/// given value exceeds [`SmallIndex::MAX`].
///
/// Using this routine with an invalid index value will result in
/// unspecified behavior, but *not* undefined behavior. In particular, an
/// invalid index value is likely to cause panics or possibly even silent
/// logical errors.
///
/// Callers must never rely on a `SmallIndex` to be within a certain range
/// for memory safety.
#[inline]
pub const fn from_u32_unchecked(index: u32) -> SmallIndex {
SmallIndex(index)
}
/// Like [`SmallIndex::new`], but panics if the given index is not valid.
#[inline]
pub fn must(index: usize) -> SmallIndex {
SmallIndex::new(index).expect("invalid small index")
}
/// Return this small index as a `usize`. This is guaranteed to never
/// overflow `usize`.
#[inline]
pub const fn as_usize(&self) -> usize {
// FIXME: Use as_usize() once const functions in traits are stable.
self.0 as usize
}
/// Return this small index as a `u64`. This is guaranteed to never
/// overflow.
#[inline]
pub const fn as_u64(&self) -> u64 {
// FIXME: Use u64::from() once const functions in traits are stable.
self.0 as u64
}
/// Return the internal `u32` of this small index. This is guaranteed to
/// never overflow `u32`.
#[inline]
pub const fn as_u32(&self) -> u32 {
self.0
}
/// Return the internal `u32` of this small index represented as an `i32`.
/// This is guaranteed to never overflow an `i32`.
#[inline]
pub const fn as_i32(&self) -> i32 {
// This is OK because we guarantee that our max value is <= i32::MAX.
self.0 as i32
}
/// Returns one more than this small index as a usize.
///
/// Since a small index has constraints on its maximum value, adding `1` to
/// it will always fit in a `usize`, `isize`, `u32` and a `i32`.
#[inline]
pub fn one_more(&self) -> usize {
self.as_usize() + 1
}
/// Decode this small index from the bytes given using the native endian
/// byte order for the current target.
///
/// If the decoded integer is not representable as a small index for the
/// current target, then this returns an error.
#[inline]
pub fn from_ne_bytes(
bytes: [u8; 4],
) -> Result<SmallIndex, SmallIndexError> {
let id = u32::from_ne_bytes(bytes);
if id > SmallIndex::MAX.as_u32() {
return Err(SmallIndexError { attempted: u64::from(id) });
}
Ok(SmallIndex::new_unchecked(id.as_usize()))
}
/// Decode this small index from the bytes given using the native endian
/// byte order for the current target.
///
    /// This is analogous to [`SmallIndex::new_unchecked`] in that it does not
/// check whether the decoded integer is representable as a small index.
#[inline]
pub fn from_ne_bytes_unchecked(bytes: [u8; 4]) -> SmallIndex {
SmallIndex::new_unchecked(u32::from_ne_bytes(bytes).as_usize())
}
/// Return the underlying small index integer as raw bytes in native endian
/// format.
#[inline]
pub fn to_ne_bytes(&self) -> [u8; 4] {
self.0.to_ne_bytes()
}
}
impl<T> core::ops::Index<SmallIndex> for [T] {
type Output = T;
#[inline]
fn index(&self, index: SmallIndex) -> &T {
&self[index.as_usize()]
}
}
impl<T> core::ops::IndexMut<SmallIndex> for [T] {
#[inline]
fn index_mut(&mut self, index: SmallIndex) -> &mut T {
&mut self[index.as_usize()]
}
}
impl<T> core::ops::Index<SmallIndex> for Vec<T> {
type Output = T;
#[inline]
fn index(&self, index: SmallIndex) -> &T {
&self[index.as_usize()]
}
}
impl<T> core::ops::IndexMut<SmallIndex> for Vec<T> {
#[inline]
fn index_mut(&mut self, index: SmallIndex) -> &mut T {
&mut self[index.as_usize()]
}
}
impl From<StateID> for SmallIndex {
fn from(sid: StateID) -> SmallIndex {
sid.0
}
}
impl From<PatternID> for SmallIndex {
fn from(pid: PatternID) -> SmallIndex {
pid.0
}
}
impl From<u8> for SmallIndex {
fn from(index: u8) -> SmallIndex {
SmallIndex::new_unchecked(usize::from(index))
}
}
impl TryFrom<u16> for SmallIndex {
type Error = SmallIndexError;
fn try_from(index: u16) -> Result<SmallIndex, SmallIndexError> {
if u32::from(index) > SmallIndex::MAX.as_u32() {
return Err(SmallIndexError { attempted: u64::from(index) });
}
Ok(SmallIndex::new_unchecked(index.as_usize()))
}
}
impl TryFrom<u32> for SmallIndex {
type Error = SmallIndexError;
fn try_from(index: u32) -> Result<SmallIndex, SmallIndexError> {
if index > SmallIndex::MAX.as_u32() {
return Err(SmallIndexError { attempted: u64::from(index) });
}
Ok(SmallIndex::new_unchecked(index.as_usize()))
}
}
impl TryFrom<u64> for SmallIndex {
type Error = SmallIndexError;
fn try_from(index: u64) -> Result<SmallIndex, SmallIndexError> {
if index > SmallIndex::MAX.as_u64() {
return Err(SmallIndexError { attempted: index });
}
Ok(SmallIndex::new_unchecked(index.as_usize()))
}
}
impl TryFrom<usize> for SmallIndex {
type Error = SmallIndexError;
fn try_from(index: usize) -> Result<SmallIndex, SmallIndexError> {
if index > SmallIndex::MAX.as_usize() {
return Err(SmallIndexError { attempted: index.as_u64() });
}
Ok(SmallIndex::new_unchecked(index))
}
}
/// This error occurs when a small index could not be constructed.
///
/// This occurs when given an integer exceeding the maximum small index value.
///
/// When the `std` feature is enabled, this implements the `Error` trait.
#[derive(Clone, Debug, Eq, PartialEq)]
pub struct SmallIndexError {
attempted: u64,
}
impl SmallIndexError {
/// Returns the value that could not be converted to a small index.
pub fn attempted(&self) -> u64 {
self.attempted
}
}
#[cfg(feature = "std")]
impl std::error::Error for SmallIndexError {}
impl core::fmt::Display for SmallIndexError {
fn fmt(&self, f: &mut core::fmt::Formatter) -> core::fmt::Result {
write!(
f,
"failed to create small index from {:?}, which exceeds {:?}",
self.attempted(),
SmallIndex::MAX,
)
}
}
#[derive(Clone, Debug)]
pub(crate) struct SmallIndexIter {
rng: core::ops::Range<usize>,
}
impl Iterator for SmallIndexIter {
type Item = SmallIndex;
fn next(&mut self) -> Option<SmallIndex> {
if self.rng.start >= self.rng.end {
return None;
}
let next_id = self.rng.start + 1;
let id = core::mem::replace(&mut self.rng.start, next_id);
// new_unchecked is OK since we asserted that the number of
// elements in this iterator will fit in an ID at construction.
Some(SmallIndex::new_unchecked(id))
}
}
macro_rules! index_type_impls {
($name:ident, $err:ident, $iter:ident, $withiter:ident) => {
impl $name {
/// The maximum value.
pub const MAX: $name = $name(SmallIndex::MAX);
/// The total number of values that can be represented.
pub const LIMIT: usize = SmallIndex::LIMIT;
/// The zero value.
pub const ZERO: $name = $name(SmallIndex::ZERO);
/// The number of bytes that a single value uses in memory.
pub const SIZE: usize = SmallIndex::SIZE;
/// Create a new value that is represented by a "small index."
///
/// If the given index exceeds the maximum allowed value, then this
/// returns an error.
#[inline]
pub fn new(value: usize) -> Result<$name, $err> {
SmallIndex::new(value).map($name).map_err($err)
}
/// Create a new value without checking whether the given argument
/// exceeds the maximum.
///
/// Using this routine with an invalid value will result in
/// unspecified behavior, but *not* undefined behavior. In
/// particular, an invalid ID value is likely to cause panics or
/// possibly even silent logical errors.
///
/// Callers must never rely on this type to be within a certain
/// range for memory safety.
#[inline]
pub const fn new_unchecked(value: usize) -> $name {
$name(SmallIndex::new_unchecked(value))
}
/// Create a new value from a `u32` without checking whether the
/// given value exceeds the maximum.
///
/// Using this routine with an invalid value will result in
/// unspecified behavior, but *not* undefined behavior. In
/// particular, an invalid ID value is likely to cause panics or
/// possibly even silent logical errors.
///
/// Callers must never rely on this type to be within a certain
/// range for memory safety.
#[inline]
pub const fn from_u32_unchecked(index: u32) -> $name {
$name(SmallIndex::from_u32_unchecked(index))
}
/// Like `new`, but panics if the given value is not valid.
#[inline]
pub fn must(value: usize) -> $name {
$name::new(value).expect(concat!(
"invalid ",
stringify!($name),
" value"
))
}
/// Return the internal value as a `usize`. This is guaranteed to
/// never overflow `usize`.
#[inline]
pub const fn as_usize(&self) -> usize {
self.0.as_usize()
}
/// Return the internal value as a `u64`. This is guaranteed to
/// never overflow.
#[inline]
pub const fn as_u64(&self) -> u64 {
self.0.as_u64()
}
/// Return the internal value as a `u32`. This is guaranteed to
/// never overflow `u32`.
#[inline]
pub const fn as_u32(&self) -> u32 {
self.0.as_u32()
}
/// Return the internal value as a `i32`. This is guaranteed to
/// never overflow an `i32`.
#[inline]
pub const fn as_i32(&self) -> i32 {
self.0.as_i32()
}
/// Returns one more than this value as a usize.
///
/// Since values represented by a "small index" have constraints
/// on their maximum value, adding `1` to it will always fit in a
/// `usize`, `u32` and a `i32`.
#[inline]
pub fn one_more(&self) -> usize {
self.0.one_more()
}
/// Decode this value from the bytes given using the native endian
/// byte order for the current target.
///
/// If the decoded integer is not representable as a small index
/// for the current target, then this returns an error.
#[inline]
pub fn from_ne_bytes(bytes: [u8; 4]) -> Result<$name, $err> {
SmallIndex::from_ne_bytes(bytes).map($name).map_err($err)
}
/// Decode this value from the bytes given using the native endian
/// byte order for the current target.
///
            /// This is analogous to `new_unchecked` in that it does not check
/// whether the decoded integer is representable as a small index.
#[inline]
pub fn from_ne_bytes_unchecked(bytes: [u8; 4]) -> $name {
$name(SmallIndex::from_ne_bytes_unchecked(bytes))
}
/// Return the underlying integer as raw bytes in native endian
/// format.
#[inline]
pub fn to_ne_bytes(&self) -> [u8; 4] {
self.0.to_ne_bytes()
}
/// Returns an iterator over all values from 0 up to and not
/// including the given length.
///
/// If the given length exceeds this type's limit, then this
/// panics.
pub(crate) fn iter(len: usize) -> $iter {
$iter::new(len)
}
}
// We write our own Debug impl so that we get things like PatternID(5)
// instead of PatternID(SmallIndex(5)).
impl core::fmt::Debug for $name {
fn fmt(&self, f: &mut core::fmt::Formatter) -> core::fmt::Result {
f.debug_tuple(stringify!($name)).field(&self.as_u32()).finish()
}
}
impl<T> core::ops::Index<$name> for [T] {
type Output = T;
#[inline]
fn index(&self, index: $name) -> &T {
&self[index.as_usize()]
}
}
impl<T> core::ops::IndexMut<$name> for [T] {
#[inline]
fn index_mut(&mut self, index: $name) -> &mut T {
&mut self[index.as_usize()]
}
}
impl<T> core::ops::Index<$name> for Vec<T> {
type Output = T;
#[inline]
fn index(&self, index: $name) -> &T {
&self[index.as_usize()]
}
}
impl<T> core::ops::IndexMut<$name> for Vec<T> {
#[inline]
fn index_mut(&mut self, index: $name) -> &mut T {
&mut self[index.as_usize()]
}
}
impl From<SmallIndex> for $name {
fn from(index: SmallIndex) -> $name {
$name(index)
}
}
impl From<u8> for $name {
fn from(value: u8) -> $name {
$name(SmallIndex::from(value))
}
}
impl TryFrom<u16> for $name {
type Error = $err;
fn try_from(value: u16) -> Result<$name, $err> {
SmallIndex::try_from(value).map($name).map_err($err)
}
}
impl TryFrom<u32> for $name {
type Error = $err;
fn try_from(value: u32) -> Result<$name, $err> {
SmallIndex::try_from(value).map($name).map_err($err)
}
}
impl TryFrom<u64> for $name {
type Error = $err;
fn try_from(value: u64) -> Result<$name, $err> {
SmallIndex::try_from(value).map($name).map_err($err)
}
}
impl TryFrom<usize> for $name {
type Error = $err;
fn try_from(value: usize) -> Result<$name, $err> {
SmallIndex::try_from(value).map($name).map_err($err)
}
}
/// This error occurs when an ID could not be constructed.
///
/// This occurs when given an integer exceeding the maximum allowed
/// value.
///
/// When the `std` feature is enabled, this implements the `Error`
/// trait.
#[derive(Clone, Debug, Eq, PartialEq)]
pub struct $err(SmallIndexError);
impl $err {
/// Returns the value that could not be converted to an ID.
pub fn attempted(&self) -> u64 {
self.0.attempted()
}
}
#[cfg(feature = "std")]
impl std::error::Error for $err {}
impl core::fmt::Display for $err {
fn fmt(&self, f: &mut core::fmt::Formatter) -> core::fmt::Result {
write!(
f,
"failed to create {} from {:?}, which exceeds {:?}",
stringify!($name),
self.attempted(),
$name::MAX,
)
}
}
#[derive(Clone, Debug)]
pub(crate) struct $iter(SmallIndexIter);
impl $iter {
fn new(len: usize) -> $iter {
assert!(
len <= $name::LIMIT,
"cannot create iterator for {} when number of \
elements exceed {:?}",
stringify!($name),
$name::LIMIT,
);
$iter(SmallIndexIter { rng: 0..len })
}
}
impl Iterator for $iter {
type Item = $name;
fn next(&mut self) -> Option<$name> {
self.0.next().map($name)
}
}
/// An iterator adapter that is like std::iter::Enumerate, but attaches
/// small index values instead. It requires `ExactSizeIterator`. At
/// construction, it ensures that the index of each element in the
/// iterator is representable in the corresponding small index type.
#[derive(Clone, Debug)]
pub(crate) struct $withiter<I> {
it: I,
ids: $iter,
}
impl<I: Iterator + ExactSizeIterator> $withiter<I> {
fn new(it: I) -> $withiter<I> {
let ids = $name::iter(it.len());
$withiter { it, ids }
}
}
impl<I: Iterator + ExactSizeIterator> Iterator for $withiter<I> {
type Item = ($name, I::Item);
fn next(&mut self) -> Option<($name, I::Item)> {
let item = self.it.next()?;
// Number of elements in this iterator must match, according
// to contract of ExactSizeIterator.
let id = self.ids.next().unwrap();
Some((id, item))
}
}
};
}
/// The identifier of a pattern in an Aho-Corasick automaton.
///
/// It is represented by a `u32` even on 64-bit systems in order to conserve
/// space. Namely, on all targets, this type guarantees that its value will
/// fit in a `u32`, `i32`, `usize` and an `isize`. This means that on 16-bit
/// targets, for example, this type's maximum value will never overflow an
/// `isize`, which means it will never overflow a `i16` even though its
/// internal representation is still a `u32`.
///
/// # Safety
///
/// While a `PatternID` is meant to guarantee that its value fits into `usize`
/// without using as much space as a `usize` on all targets, callers must
/// not rely on this property for safety. Callers may choose to rely on this
/// property for correctness however. For example, creating a `PatternID` with
/// invalid value can be done in entirely safe code. This may in turn result in
/// panics or silent logical errors.
#[derive(Clone, Copy, Default, Eq, Hash, PartialEq, PartialOrd, Ord)]
#[repr(transparent)]
pub struct PatternID(SmallIndex);
/// The identifier of a finite automaton state.
///
/// It is represented by a `u32` even on 64-bit systems in order to conserve
/// space. Namely, on all targets, this type guarantees that its value will
/// fit in a `u32`, `i32`, `usize` and an `isize`. This means that on 16-bit
/// targets, for example, this type's maximum value will never overflow an
/// `isize`, which means it will never overflow a `i16` even though its
/// internal representation is still a `u32`.
///
/// # Safety
///
/// While a `StateID` is meant to guarantee that its value fits into `usize`
/// without using as much space as a `usize` on all targets, callers must
/// not rely on this property for safety. Callers may choose to rely on this
/// property for correctness however. For example, creating a `StateID` with an
/// invalid value can be done in entirely safe code. This may in turn result in
/// panics or silent logical errors.
#[derive(Clone, Copy, Default, Eq, Hash, PartialEq, PartialOrd, Ord)]
#[repr(transparent)]
pub struct StateID(SmallIndex);
index_type_impls!(PatternID, PatternIDError, PatternIDIter, WithPatternIDIter);
index_type_impls!(StateID, StateIDError, StateIDIter, WithStateIDIter);
/// A utility trait that defines a couple of adapters for making it convenient
/// to access indices as "small index" types. We require ExactSizeIterator so
/// that iterator construction can do a single check to make sure the index of
/// each element is representable by its small index type.
pub(crate) trait IteratorIndexExt: Iterator {
fn with_pattern_ids(self) -> WithPatternIDIter<Self>
where
Self: Sized + ExactSizeIterator,
{
WithPatternIDIter::new(self)
}
fn with_state_ids(self) -> WithStateIDIter<Self>
where
Self: Sized + ExactSizeIterator,
{
WithStateIDIter::new(self)
}
}
impl<I: Iterator> IteratorIndexExt for I {}
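As a rough illustration of the "small index" pattern described in the module docs above, here is a stripped-down sketch with invented names (`TinyIndex` is not part of the crate): a `u32`-backed newtype with a fallible constructor and an `Index` impl so slices can be indexed with it directly.

```rust
use core::ops::Index;

#[derive(Clone, Copy, Debug, Eq, PartialEq)]
#[repr(transparent)]
struct TinyIndex(u32);

impl TinyIndex {
    // Capped below i32::MAX so the value also fits in i32/isize.
    const MAX: u32 = i32::MAX as u32 - 1;

    // Fallible constructor: callers handle overflow once, up front, instead
    // of sprinkling `as` casts (and silent truncation) everywhere.
    fn new(index: usize) -> Result<TinyIndex, usize> {
        if index > TinyIndex::MAX as usize {
            Err(index)
        } else {
            Ok(TinyIndex(index as u32))
        }
    }

    fn as_usize(self) -> usize {
        self.0 as usize
    }
}

// Same shape as the crate's own `impl Index<SmallIndex> for [T]` above.
impl<T> Index<TinyIndex> for [T] {
    type Output = T;
    fn index(&self, index: TinyIndex) -> &T {
        &self[index.as_usize()]
    }
}

fn main() {
    let states = ["dead", "match", "start"];
    let id = TinyIndex::new(1).unwrap();
    // Indexing with the newtype reads just like indexing with a usize.
    assert_eq!(states[id], "match");
    // Out-of-range values are rejected instead of being truncated.
    assert!(TinyIndex::new(usize::MAX).is_err());
}
```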

214
vendor/aho-corasick/src/util/remapper.rs vendored Normal file
View File

@@ -0,0 +1,214 @@
use alloc::vec::Vec;
use crate::{nfa::noncontiguous, util::primitives::StateID};
/// Remappable is a tightly coupled abstraction that facilitates remapping
/// state identifiers in DFAs.
///
/// The main idea behind remapping state IDs is that DFAs often need to check
/// if a certain state is a "special" state of some kind (like a match state)
/// during a search. Since this is extremely perf critical code, we want this
/// check to be as fast as possible. Partitioning state IDs into, for example,
/// into "non-match" and "match" states means one can tell if a state is a
/// match state via a simple comparison of the state ID.
///
/// The issue is that during the DFA construction process, it's not
/// particularly easy to partition the states. Instead, the simplest thing is
/// to often just do a pass over all of the states and shuffle them into their
/// desired partitionings. To do that, we need a mechanism for swapping states.
/// Hence, this abstraction.
///
/// Normally, for such little code, I would just duplicate it. But this is a
/// key optimization and the implementation is a bit subtle. So the abstraction
/// is basically a ham-fisted attempt at DRY. The only place we use this is in
/// the dense and one-pass DFAs.
///
/// See also src/dfa/special.rs for a more detailed explanation of how dense
/// DFAs are partitioned.
pub(crate) trait Remappable: core::fmt::Debug {
/// Return the total number of states.
fn state_len(&self) -> usize;
/// Swap the states pointed to by the given IDs. The underlying finite
/// state machine should be mutated such that all of the transitions in
/// `id1` are now in the memory region where the transitions for `id2`
/// were, and all of the transitions in `id2` are now in the memory region
/// where the transitions for `id1` were.
///
/// Essentially, this "moves" `id1` to `id2` and `id2` to `id1`.
///
/// It is expected that, after calling this, the underlying state machine
/// will be left in an inconsistent state, since any other transitions
/// pointing to, e.g., `id1` need to be updated to point to `id2`, since
/// that's where `id1` moved to.
///
/// In order to "fix" the underlying inconsistent state, a `Remapper`
/// should be used to guarantee that `remap` is called at the appropriate
/// time.
fn swap_states(&mut self, id1: StateID, id2: StateID);
/// This must remap every single state ID in the underlying value according
/// to the function given. For example, in a DFA, this should remap every
/// transition and every starting state ID.
fn remap(&mut self, map: impl Fn(StateID) -> StateID);
}
/// Remapper is an abstraction that manages the remapping of state IDs in a
/// finite state machine. This is useful when one wants to shuffle states into
/// different positions in the machine.
///
/// One of the key complexities this manages is the ability to correctly move
/// one state multiple times.
///
/// Once shuffling is complete, `remap` must be called, which will rewrite
/// all pertinent transitions to updated state IDs. Neglecting to call `remap`
/// will almost certainly result in a corrupt machine.
#[derive(Debug)]
pub(crate) struct Remapper {
/// A map from the index of a state to its pre-multiplied identifier.
///
/// When a state is swapped with another, then their corresponding
/// locations in this map are also swapped. Thus, its new position will
/// still point to its old pre-multiplied StateID.
///
/// While there is a bit more to it, this then allows us to rewrite the
/// state IDs in a DFA's transition table in a single pass. This is done
/// by iterating over every ID in this map, then iterating over each
/// transition for the state at that ID and re-mapping the transition from
/// `old_id` to `map[dfa.to_index(old_id)]`. That is, we find the position
/// in this map where `old_id` *started*, and set it to where it ended up
/// after all swaps have been completed.
map: Vec<StateID>,
/// A way to map indices to state IDs (and back).
idx: IndexMapper,
}
impl Remapper {
/// Create a new remapper from the given remappable implementation. The
/// remapper can then be used to swap states. The remappable value given
    /// here must be the same one given to `swap` and `remap`.
///
/// The given stride should be the stride of the transition table expressed
/// as a power of 2. This stride is used to map between state IDs and state
/// indices. If state IDs and state indices are equivalent, then provide
/// a `stride2` of `0`, which acts as an identity.
pub(crate) fn new(r: &impl Remappable, stride2: usize) -> Remapper {
let idx = IndexMapper { stride2 };
let map = (0..r.state_len()).map(|i| idx.to_state_id(i)).collect();
Remapper { map, idx }
}
/// Swap two states. Once this is called, callers must follow through to
/// call `remap`, or else it's possible for the underlying remappable
/// value to be in a corrupt state.
pub(crate) fn swap(
&mut self,
r: &mut impl Remappable,
id1: StateID,
id2: StateID,
) {
if id1 == id2 {
return;
}
r.swap_states(id1, id2);
self.map.swap(self.idx.to_index(id1), self.idx.to_index(id2));
}
/// Complete the remapping process by rewriting all state IDs in the
/// remappable value according to the swaps performed.
pub(crate) fn remap(mut self, r: &mut impl Remappable) {
// Update the map to account for states that have been swapped
// multiple times. For example, if (A, C) and (C, G) are swapped, then
// transitions previously pointing to A should now point to G. But if
// we don't update our map, they will erroneously be set to C. All we
// do is follow the swaps in our map until we see our original state
// ID.
//
// The intuition here is to think about how changes are made to the
// map: only through pairwise swaps. That means that starting at any
// given state, it is always possible to find the loop back to that
// state by following the swaps represented in the map (which might be
// 0 swaps).
//
// We are also careful to clone the map before starting in order to
// freeze it. We use the frozen map to find our loops, since we need to
// update our map as well. Without freezing it, our updates could break
// the loops referenced above and produce incorrect results.
let oldmap = self.map.clone();
for i in 0..r.state_len() {
let cur_id = self.idx.to_state_id(i);
let mut new_id = oldmap[i];
if cur_id == new_id {
continue;
}
loop {
let id = oldmap[self.idx.to_index(new_id)];
if cur_id == id {
self.map[i] = new_id;
break;
}
new_id = id;
}
}
r.remap(|sid| self.map[self.idx.to_index(sid)]);
}
}
/// A simple type for mapping between state indices and state IDs.
///
/// The reason why this exists is because state IDs are "premultiplied" in a
/// DFA. That is, in order to get to the transitions for a particular state,
/// one need only use the state ID as-is, instead of having to multiply it by
/// transition table's stride.
///
/// The downside of this is that it's inconvenient to map between state IDs
/// using a dense map, e.g., Vec<StateID>. That's because state IDs look like
/// `0`, `stride`, `2*stride`, `3*stride`, etc., instead of `0`, `1`, `2`, `3`,
/// etc.
///
/// Since our state IDs are premultiplied, we can convert back-and-forth
/// between IDs and indices by simply unmultiplying the IDs and multiplying the
/// indices.
///
/// Note that for a sparse NFA, state IDs and indices are equivalent. In this
/// case, we set the stride of the index mapper to be `0`, which acts as an
/// identity.
#[derive(Debug)]
struct IndexMapper {
/// The power of 2 corresponding to the stride of the corresponding
/// transition table. 'id >> stride2' de-multiplies an ID while 'index <<
/// stride2' pre-multiplies an index to an ID.
stride2: usize,
}
impl IndexMapper {
/// Convert a state ID to a state index.
fn to_index(&self, id: StateID) -> usize {
id.as_usize() >> self.stride2
}
/// Convert a state index to a state ID.
fn to_state_id(&self, index: usize) -> StateID {
// CORRECTNESS: If the given index is not valid, then it is not
// required for this to panic or return a valid state ID. We'll "just"
// wind up with panics or silent logic errors at some other point. But
// this is OK because if Remappable::state_len is correct and so is
// 'to_index', then all inputs to 'to_state_id' should be valid indices
// and thus transform into valid state IDs.
StateID::new_unchecked(index << self.stride2)
}
}
impl Remappable for noncontiguous::NFA {
fn state_len(&self) -> usize {
noncontiguous::NFA::states(self).len()
}
fn swap_states(&mut self, id1: StateID, id2: StateID) {
noncontiguous::NFA::swap_states(self, id1, id2)
}
fn remap(&mut self, map: impl Fn(StateID) -> StateID) {
noncontiguous::NFA::remap(self, map)
}
}
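To make the fix-up loop in `Remapper::remap` concrete, here is a self-contained sketch of the same chain-following logic on a plain `Vec<usize>`, assuming `stride2 = 0` so that state IDs and indices coincide; the names and values are invented for the example and this is not the crate's code:

```rust
fn main() {
    let n = 4;
    // Identity map: slot i initially holds ID i.
    let mut map: Vec<usize> = (0..n).collect();

    // Record two swaps: (0, 2) and then (2, 3). State 0 therefore ends up in
    // slot 3, state 2 in slot 0, and state 3 in slot 2.
    map.swap(0, 2);
    map.swap(2, 3);

    // Fix-up pass: for each index, follow the swap chain in a frozen copy of
    // the map until it loops back to the original ID; the ID seen just before
    // the loop closes is where that state finally ended up.
    let oldmap = map.clone();
    for i in 0..n {
        let cur_id = i;
        let mut new_id = oldmap[i];
        if cur_id == new_id {
            continue;
        }
        loop {
            let id = oldmap[new_id];
            if cur_id == id {
                map[i] = new_id;
                break;
            }
            new_id = id;
        }
    }

    // Transitions that pointed at old IDs 0, 2 and 3 must now point at
    // 3, 0 and 2 respectively, which is exactly what the table says.
    assert_eq!(map, vec![3, 1, 0, 2]);
    println!("remap table: {:?}", map);
}
```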

1148
vendor/aho-corasick/src/util/search.rs vendored Normal file

File diff suppressed because it is too large Load Diff

42
vendor/aho-corasick/src/util/special.rs vendored Normal file
View File

@@ -0,0 +1,42 @@
use crate::util::primitives::StateID;
/// A collection of sentinel state IDs for Aho-Corasick automata.
///
/// This specifically enables the technique by which we determine which states
/// are dead, matches or start states. Namely, by arranging states in a
/// particular order, we can determine the type of a state simply by looking at
/// its ID.
#[derive(Clone, Debug)]
pub(crate) struct Special {
/// The maximum ID of all the "special" states. This corresponds either to
/// start_anchored_id when a prefilter is active and max_match_id when a
/// prefilter is not active. The idea here is that if there is no prefilter,
/// then there is no point in treating start states as special.
pub(crate) max_special_id: StateID,
/// The maximum ID of all the match states. Any state ID bigger than this
/// is guaranteed to be a non-match ID.
///
/// It is possible and legal for max_match_id to be equal to
/// start_anchored_id, which occurs precisely in the case where the empty
/// string is a pattern that was added to the underlying automaton.
pub(crate) max_match_id: StateID,
/// The state ID of the start state used for unanchored searches.
pub(crate) start_unanchored_id: StateID,
/// The state ID of the start state used for anchored searches. This is
/// always start_unanchored_id+1.
pub(crate) start_anchored_id: StateID,
}
impl Special {
/// Create a new set of "special" state IDs with all IDs initialized to
/// zero. The general idea here is that they will be updated and set to
/// correct values later.
pub(crate) fn zero() -> Special {
Special {
max_special_id: StateID::ZERO,
max_match_id: StateID::ZERO,
start_unanchored_id: StateID::ZERO,
start_anchored_id: StateID::ZERO,
}
}
}
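A rough sketch of how such an ordering is consumed during a search: with states laid out as dead state first, then match states, then the two start states, then everything else, classifying a state takes only a few integer comparisons. The concrete layout below (dead state at ID 0 and the specific ID ranges) is assumed purely for illustration and is not taken from the crate:

```rust
// Hypothetical thresholds mirroring the idea of the Special struct above.
#[derive(Clone, Copy, Debug)]
struct SpecialIds {
    max_special_id: u32,
    max_match_id: u32,
}

fn classify(special: &SpecialIds, id: u32) -> &'static str {
    if id > special.max_special_id {
        // Most states fall here, so a single comparison usually suffices.
        "ordinary"
    } else if id == 0 {
        "dead"
    } else if id <= special.max_match_id {
        "match"
    } else {
        "start"
    }
}

fn main() {
    // Hypothetical automaton: dead = 0, matches = 1..=2, starts = 3..=4.
    let special = SpecialIds { max_special_id: 4, max_match_id: 2 };
    assert_eq!(classify(&special, 0), "dead");
    assert_eq!(classify(&special, 2), "match");
    assert_eq!(classify(&special, 3), "start");
    assert_eq!(classify(&special, 7), "ordinary");
}
```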

View File

@@ -0,0 +1 @@
{"files":{"Cargo.toml":"a87d9acc9827a50c7a96a88720c5dd055cbc08b1144dff95bd572ff977d4a79a","LICENSE-APACHE":"4458503dd48e88c4e0b945fb252a08b93c40ec757309b8ffa7c594dfa1e35104","LICENSE-MIT":"002c2696d92b5c8cf956c11072baa58eaf9f6ade995c031ea635c6a1ee342ad1","README.md":"6dfe0c602dc61eebe118900ed66a2c1f7887b9fe95b36e1c2974c4e8fa7ebd4b","src/lib.rs":"8f421233df83f82e737930ca8a2ad254966334183148bcc170f9c405df230de2","src/tzdata.rs":"78920925b04219910511e9a1f036f468cd2925c0054f280d6a00b106529046e7"},"package":"e999941b234f3131b00bc13c22d06e8c5ff726d1b6318ac7eb276997bbb4fef0"}

34
vendor/android-tzdata/Cargo.toml vendored Normal file
View File

@@ -0,0 +1,34 @@
# THIS FILE IS AUTOMATICALLY GENERATED BY CARGO
#
# When uploading crates to the registry Cargo will automatically
# "normalize" Cargo.toml files for maximal compatibility
# with all versions of Cargo and also rewrite `path` dependencies
# to registry (e.g., crates.io) dependencies.
#
# If you are reading this file be aware that the original Cargo.toml
# will likely look very different (and much more reasonable).
# See Cargo.toml.orig for the original contents.
[package]
edition = "2018"
name = "android-tzdata"
version = "0.1.1"
authors = ["RumovZ"]
include = [
"src/**/*",
"LICENSE-*",
"README.md",
]
description = "Parser for the Android-specific tzdata file"
readme = "README.md"
keywords = [
"parser",
"android",
"timezone",
]
categories = ["date-and-time"]
license = "MIT OR Apache-2.0"
repository = "https://github.com/RumovZ/android-tzdata"
[dev-dependencies.zip]
version = "0.6.4"

201
vendor/android-tzdata/LICENSE-APACHE vendored Normal file
View File

@@ -0,0 +1,201 @@
Apache License
Version 2.0, January 2004
http://www.apache.org/licenses/
TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION
1. Definitions.
"License" shall mean the terms and conditions for use, reproduction,
and distribution as defined by Sections 1 through 9 of this document.
"Licensor" shall mean the copyright owner or entity authorized by
the copyright owner that is granting the License.
"Legal Entity" shall mean the union of the acting entity and all
other entities that control, are controlled by, or are under common
control with that entity. For the purposes of this definition,
"control" means (i) the power, direct or indirect, to cause the
direction or management of such entity, whether by contract or
otherwise, or (ii) ownership of fifty percent (50%) or more of the
outstanding shares, or (iii) beneficial ownership of such entity.
"You" (or "Your") shall mean an individual or Legal Entity
exercising permissions granted by this License.
"Source" form shall mean the preferred form for making modifications,
including but not limited to software source code, documentation
source, and configuration files.
"Object" form shall mean any form resulting from mechanical
transformation or translation of a Source form, including but
not limited to compiled object code, generated documentation,
and conversions to other media types.
"Work" shall mean the work of authorship, whether in Source or
Object form, made available under the License, as indicated by a
copyright notice that is included in or attached to the work
(an example is provided in the Appendix below).
"Derivative Works" shall mean any work, whether in Source or Object
form, that is based on (or derived from) the Work and for which the
editorial revisions, annotations, elaborations, or other modifications
represent, as a whole, an original work of authorship. For the purposes
of this License, Derivative Works shall not include works that remain
separable from, or merely link (or bind by name) to the interfaces of,
the Work and Derivative Works thereof.
"Contribution" shall mean any work of authorship, including
the original version of the Work and any modifications or additions
to that Work or Derivative Works thereof, that is intentionally
submitted to Licensor for inclusion in the Work by the copyright owner
or by an individual or Legal Entity authorized to submit on behalf of
the copyright owner. For the purposes of this definition, "submitted"
means any form of electronic, verbal, or written communication sent
to the Licensor or its representatives, including but not limited to
communication on electronic mailing lists, source code control systems,
and issue tracking systems that are managed by, or on behalf of, the
Licensor for the purpose of discussing and improving the Work, but
excluding communication that is conspicuously marked or otherwise
designated in writing by the copyright owner as "Not a Contribution."
"Contributor" shall mean Licensor and any individual or Legal Entity
on behalf of whom a Contribution has been received by Licensor and
subsequently incorporated within the Work.
2. Grant of Copyright License. Subject to the terms and conditions of
this License, each Contributor hereby grants to You a perpetual,
worldwide, non-exclusive, no-charge, royalty-free, irrevocable
copyright license to reproduce, prepare Derivative Works of,
publicly display, publicly perform, sublicense, and distribute the
Work and such Derivative Works in Source or Object form.
3. Grant of Patent License. Subject to the terms and conditions of
this License, each Contributor hereby grants to You a perpetual,
worldwide, non-exclusive, no-charge, royalty-free, irrevocable
(except as stated in this section) patent license to make, have made,
use, offer to sell, sell, import, and otherwise transfer the Work,
where such license applies only to those patent claims licensable
by such Contributor that are necessarily infringed by their
Contribution(s) alone or by combination of their Contribution(s)
with the Work to which such Contribution(s) was submitted. If You
institute patent litigation against any entity (including a
cross-claim or counterclaim in a lawsuit) alleging that the Work
or a Contribution incorporated within the Work constitutes direct
or contributory patent infringement, then any patent licenses
granted to You under this License for that Work shall terminate
as of the date such litigation is filed.
4. Redistribution. You may reproduce and distribute copies of the
Work or Derivative Works thereof in any medium, with or without
modifications, and in Source or Object form, provided that You
meet the following conditions:
(a) You must give any other recipients of the Work or
Derivative Works a copy of this License; and
(b) You must cause any modified files to carry prominent notices
stating that You changed the files; and
(c) You must retain, in the Source form of any Derivative Works
that You distribute, all copyright, patent, trademark, and
attribution notices from the Source form of the Work,
excluding those notices that do not pertain to any part of
the Derivative Works; and
(d) If the Work includes a "NOTICE" text file as part of its
distribution, then any Derivative Works that You distribute must
include a readable copy of the attribution notices contained
within such NOTICE file, excluding those notices that do not
pertain to any part of the Derivative Works, in at least one
of the following places: within a NOTICE text file distributed
as part of the Derivative Works; within the Source form or
documentation, if provided along with the Derivative Works; or,
within a display generated by the Derivative Works, if and
wherever such third-party notices normally appear. The contents
of the NOTICE file are for informational purposes only and
do not modify the License. You may add Your own attribution
notices within Derivative Works that You distribute, alongside
or as an addendum to the NOTICE text from the Work, provided
that such additional attribution notices cannot be construed
as modifying the License.
You may add Your own copyright statement to Your modifications and
may provide additional or different license terms and conditions
for use, reproduction, or distribution of Your modifications, or
for any such Derivative Works as a whole, provided Your use,
reproduction, and distribution of the Work otherwise complies with
the conditions stated in this License.
5. Submission of Contributions. Unless You explicitly state otherwise,
any Contribution intentionally submitted for inclusion in the Work
by You to the Licensor shall be under the terms and conditions of
this License, without any additional terms or conditions.
Notwithstanding the above, nothing herein shall supersede or modify
the terms of any separate license agreement you may have executed
with Licensor regarding such Contributions.
6. Trademarks. This License does not grant permission to use the trade
names, trademarks, service marks, or product names of the Licensor,
except as required for reasonable and customary use in describing the
origin of the Work and reproducing the content of the NOTICE file.
7. Disclaimer of Warranty. Unless required by applicable law or
agreed to in writing, Licensor provides the Work (and each
Contributor provides its Contributions) on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
implied, including, without limitation, any warranties or conditions
of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A
PARTICULAR PURPOSE. You are solely responsible for determining the
appropriateness of using or redistributing the Work and assume any
risks associated with Your exercise of permissions under this License.
8. Limitation of Liability. In no event and under no legal theory,
whether in tort (including negligence), contract, or otherwise,
unless required by applicable law (such as deliberate and grossly
negligent acts) or agreed to in writing, shall any Contributor be
liable to You for damages, including any direct, indirect, special,
incidental, or consequential damages of any character arising as a
result of this License or out of the use or inability to use the
Work (including but not limited to damages for loss of goodwill,
work stoppage, computer failure or malfunction, or any and all
other commercial damages or losses), even if such Contributor
has been advised of the possibility of such damages.
9. Accepting Warranty or Additional Liability. While redistributing
the Work or Derivative Works thereof, You may choose to offer,
and charge a fee for, acceptance of support, warranty, indemnity,
or other liability obligations and/or rights consistent with this
License. However, in accepting such obligations, You may act only
on Your own behalf and on Your sole responsibility, not on behalf
of any other Contributor, and only if You agree to indemnify,
defend, and hold each Contributor harmless for any liability
incurred by, or claims asserted against, such Contributor by reason
of your accepting any such warranty or additional liability.
END OF TERMS AND CONDITIONS
APPENDIX: How to apply the Apache License to your work.
To apply the Apache License to your work, attach the following
boilerplate notice, with the fields enclosed by brackets "[]"
replaced with your own identifying information. (Don't include
the brackets!) The text should be enclosed in the appropriate
comment syntax for the file format. We also recommend that a
file or class name and description of purpose be included on the
same "printed page" as the copyright notice for easier
identification within third-party archives.
Copyright [yyyy] [name of copyright owner]
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.

21
vendor/android-tzdata/LICENSE-MIT vendored Normal file
View File

@@ -0,0 +1,21 @@
MIT License
Copyright (c) [year] [fullname]
Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:
The above copyright notice and this permission notice shall be included in all
copies or substantial portions of the Software.
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
SOFTWARE.

20
vendor/android-tzdata/README.md vendored Normal file
View File

@@ -0,0 +1,20 @@
# android-tzdata
Parser for the Android-specific tzdata file.
## License
Licensed under either of
- Apache License, Version 2.0
([LICENSE-APACHE](LICENSE-APACHE) or http://www.apache.org/licenses/LICENSE-2.0)
- MIT license
([LICENSE-MIT](LICENSE-MIT) or http://opensource.org/licenses/MIT)
at your option.
## Contribution
Unless you explicitly state otherwise, any contribution intentionally submitted
for inclusion in the work by you, as defined in the Apache-2.0 license, shall be
dual licensed as above, without any additional terms or conditions.

29
vendor/android-tzdata/src/lib.rs vendored Normal file
View File

@@ -0,0 +1,29 @@
//! Parser for the Android-specific tzdata file.
mod tzdata;
/// Tries to locate the `tzdata` file, parse it, and return the entry for the
/// requested time zone.
///
/// # Errors
///
/// Returns an [std::io::Error] if the `tzdata` file cannot be found and parsed, or
/// if it does not contain the requested timezone entry.
///
/// # Example
///
/// ```rust
/// # use std::error::Error;
/// # use android_tzdata::find_tz_data;
/// #
/// # fn main() -> Result<(), Box<dyn Error>> {
/// let tz_data = find_tz_data("Europe/Kiev")?;
/// // Check it's version 2 of the [Time Zone Information Format](https://www.ietf.org/archive/id/draft-murchison-rfc8536bis-02.html).
/// assert!(tz_data.starts_with(b"TZif2"));
/// # Ok(())
/// # }
/// ```
pub fn find_tz_data(tz_name: impl AsRef<str>) -> Result<Vec<u8>, std::io::Error> {
let mut file = tzdata::find_file()?;
tzdata::find_tz_data_in_file(&mut file, tz_name.as_ref())
}

166
vendor/android-tzdata/src/tzdata.rs vendored Normal file
View File

@@ -0,0 +1,166 @@
//! Logic was mainly ported from https://android.googlesource.com/platform/libcore/+/jb-mr2-release/luni/src/main/java/libcore/util/ZoneInfoDB.java
use core::{cmp::Ordering, convert::TryInto};
use std::{
fs::File,
io::{self, ErrorKind, Read, Seek, SeekFrom},
};
// The database uses 32-bit (4 byte) integers.
const TZ_INT_SIZE: usize = 4;
// The first 12 bytes contain a special version string.
const MAGIC_SIZE: usize = 12;
const HEADER_SIZE: usize = MAGIC_SIZE + 3 * TZ_INT_SIZE;
// The database reserves 40 bytes for each id.
const TZ_NAME_SIZE: usize = 40;
const INDEX_ENTRY_SIZE: usize = TZ_NAME_SIZE + 3 * TZ_INT_SIZE;
const TZDATA_LOCATIONS: [TzdataLocation; 2] = [
TzdataLocation {
env_var: "ANDROID_DATA",
path: "/misc/zoneinfo/",
},
TzdataLocation {
env_var: "ANDROID_ROOT",
path: "/usr/share/zoneinfo/",
},
];
#[derive(Debug)]
struct TzdataLocation {
env_var: &'static str,
path: &'static str,
}
#[derive(Debug, Clone, Copy)]
struct Header {
index_offset: usize,
data_offset: usize,
_zonetab_offset: usize,
}
#[derive(Debug)]
struct Index(Vec<u8>);
#[derive(Debug, Clone, Copy)]
struct IndexEntry<'a> {
_name: &'a [u8],
offset: usize,
length: usize,
_raw_utc_offset: usize,
}
pub(super) fn find_file() -> Result<File, io::Error> {
for location in &TZDATA_LOCATIONS {
if let Ok(env_value) = std::env::var(location.env_var) {
if let Ok(file) = File::open(format!("{}{}tzdata", env_value, location.path)) {
return Ok(file);
}
}
}
Err(io::Error::from(io::ErrorKind::NotFound))
}
pub(super) fn find_tz_data_in_file(
mut file: impl Read + Seek,
tz_name: &str,
) -> Result<Vec<u8>, io::Error> {
let header = Header::new(&mut file)?;
let index = Index::new(&mut file, header)?;
if let Some(entry) = index.find_entry(tz_name) {
file.seek(SeekFrom::Start((entry.offset + header.data_offset) as u64))?;
let mut tz_data = vec![0u8; entry.length];
file.read_exact(&mut tz_data)?;
Ok(tz_data)
} else {
Err(io::Error::from(ErrorKind::NotFound))
}
}
impl Header {
fn new(mut file: impl Read + Seek) -> Result<Self, io::Error> {
let mut buf = [0; HEADER_SIZE];
file.read_exact(&mut buf)?;
if !buf.starts_with(b"tzdata") || buf[MAGIC_SIZE - 1] != 0u8 {
return Err(io::Error::new(
io::ErrorKind::InvalidData,
"invalid magic number",
));
}
Ok(Self {
index_offset: parse_tz_int(&buf, MAGIC_SIZE) as usize,
data_offset: parse_tz_int(&buf, MAGIC_SIZE + TZ_INT_SIZE) as usize,
_zonetab_offset: parse_tz_int(&buf, MAGIC_SIZE + 2 * TZ_INT_SIZE) as usize,
})
}
}
impl Index {
fn new(mut file: impl Read + Seek, header: Header) -> Result<Self, io::Error> {
file.seek(SeekFrom::Start(header.index_offset as u64))?;
let size = header.data_offset - header.index_offset;
let mut bytes = vec![0; size];
file.read_exact(&mut bytes)?;
Ok(Self(bytes))
}
fn find_entry(&self, name: &str) -> Option<IndexEntry> {
let name_bytes = name.as_bytes();
let name_len = name_bytes.len();
if name_len > TZ_NAME_SIZE {
return None;
}
let zeros = [0u8; TZ_NAME_SIZE];
let cmp = |chunk: &&[u8]| -> Ordering {
// tz names always have TZ_NAME_SIZE bytes and are right-padded with 0s
// so we check that a chunk starts with `name` and the remaining bytes are 0
chunk[..name_len]
.cmp(name_bytes)
.then_with(|| chunk[name_len..TZ_NAME_SIZE].cmp(&zeros[name_len..]))
};
let chunks: Vec<_> = self.0.chunks_exact(INDEX_ENTRY_SIZE).collect();
chunks
.binary_search_by(cmp)
.map(|idx| IndexEntry::new(chunks[idx]))
.ok()
}
}
impl<'a> IndexEntry<'a> {
fn new(bytes: &'a [u8]) -> Self {
Self {
_name: bytes[..TZ_NAME_SIZE]
.splitn(2, |&b| b == 0u8)
.next()
.unwrap(),
offset: parse_tz_int(bytes, TZ_NAME_SIZE) as usize,
length: parse_tz_int(bytes, TZ_NAME_SIZE + TZ_INT_SIZE) as usize,
_raw_utc_offset: parse_tz_int(bytes, TZ_NAME_SIZE + 2 * TZ_INT_SIZE) as usize,
}
}
}
/// Panics if slice does not contain [TZ_INT_SIZE] bytes beginning at start.
fn parse_tz_int(slice: &[u8], start: usize) -> u32 {
u32::from_be_bytes(slice[start..start + TZ_INT_SIZE].try_into().unwrap())
}
#[cfg(test)]
mod test {
use super::*;
use std::fs::File;
use std::io::Cursor;
#[test]
fn parse() {
let mut archive = File::open("tests/resources/tzdata.zip").unwrap();
let mut zip = zip::ZipArchive::new(&mut archive).unwrap();
let mut file = zip.by_index(0).unwrap();
let mut data = Vec::new();
file.read_to_end(&mut data).unwrap();
let cursor = Cursor::new(data);
let tz = find_tz_data_in_file(cursor, "Europe/Kiev").unwrap();
assert!(tz.starts_with(b"TZif2"));
}
}
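To make the index layout concrete, here is a self-contained sketch that builds one synthetic index entry (a 40-byte zero-padded zone name followed by three big-endian `u32` fields: offset, length and raw UTC offset) and reads it back with the same `parse_tz_int` logic; the data is invented and this is not the real Android `tzdata` file:

```rust
use std::convert::TryInto;

const TZ_INT_SIZE: usize = 4;
const TZ_NAME_SIZE: usize = 40;

// Same helper as above: read one big-endian u32 starting at `start`.
fn parse_tz_int(slice: &[u8], start: usize) -> u32 {
    u32::from_be_bytes(slice[start..start + TZ_INT_SIZE].try_into().unwrap())
}

fn main() {
    // One synthetic index entry for "Europe/Kiev"; the raw UTC offset field
    // is left at zero since it is unused by the parser above.
    let mut entry = vec![0u8; TZ_NAME_SIZE + 3 * TZ_INT_SIZE];
    entry[..b"Europe/Kiev".len()].copy_from_slice(b"Europe/Kiev");
    entry[TZ_NAME_SIZE..TZ_NAME_SIZE + 4].copy_from_slice(&1024u32.to_be_bytes()); // offset
    entry[TZ_NAME_SIZE + 4..TZ_NAME_SIZE + 8].copy_from_slice(&512u32.to_be_bytes()); // length

    // The name occupies the first 40 bytes, right-padded with zeros.
    let name = entry[..TZ_NAME_SIZE].splitn(2, |&b| b == 0u8).next().unwrap();
    assert_eq!(name, &b"Europe/Kiev"[..]);
    assert_eq!(parse_tz_int(&entry, TZ_NAME_SIZE), 1024);
    assert_eq!(parse_tz_int(&entry, TZ_NAME_SIZE + TZ_INT_SIZE), 512);
}
```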

View File

@@ -0,0 +1 @@
{"files":{"CONTRIBUTING.md":"0834cb3b5e092977688d73d219a05bed23ae0ecb54b6d6e5d866ce07f6583b5e","Cargo.lock":"37ffc00dbbbec58fd27b4f4cb597e5402d6cf615ce0458f62a73a7f0d987e5bd","Cargo.toml":"e9e8c037cdef7adc9794b17c13e5a014421524d67ea5048bc09cf70ef13c782c","LICENSE-APACHE":"216486f29671a4262efe32af6d84a75bef398127f8c5f369b5c8305983887a06","LICENSE-MIT":"80f275e90d799911ed3830a7f242a2ef5a4ade2092fe0aa07bfb2d2cf2f2b95e","README.md":"aba8ff5dbd0712326d97d32bc6a3b66b24d1980a446c238f7e14b96784766cd1","examples/time_zone.rs":"8edb32a946ef2680146ba9ac16c233dd94391ac9f98464e9fb6f87d3954b72a9","src/lib.rs":"0004133d6c3805bf449e7183d2931e9640167511bea6cd12b400805073c4305d"},"package":"819e7219dbd41043ac279b19830f2efc897156490d7fd6ea916720117ee66311"}

View File

@@ -0,0 +1,40 @@
# Contributing
Contributions are very much welcome. Here are the guidelines if you are thinking of helping us:
## Contributions
Contributions should be made in the form of GitHub pull requests.
Each pull request will be reviewed by a core contributor (someone with
permission to land patches) and either landed in the main tree or
given feedback for changes that would be required.
Should you wish to work on an issue, please claim it first by commenting on
the GitHub issue that you want to work on it. This is to prevent duplicated
efforts from contributors on the same issue.
## Pull Request Checklist
- Branch from the master branch and, if needed, rebase to the current master
branch before submitting your pull request. If it doesn't merge cleanly with
master you may be asked to rebase your changes.
- Commits should be as small as possible, while ensuring that each commit is
correct independently (i.e., each commit should compile and pass tests).
- If your patch is not getting reviewed or you need a specific person to review
it, you can @-reply a reviewer asking for a review in the pull request or a
comment.
- Whenever applicable, add tests relevant to the fixed bug or new feature.
For specific git instructions, see [GitHub workflow 101](https://github.com/servo/servo/wiki/Github-workflow).
## Conduct
We follow the [Rust Code of Conduct](http://www.rust-lang.org/conduct.html).
For escalation or moderation issues, please contact Nical (nical@fastmail.com) instead of the Rust moderation team.
## License
Unless you explicitly state otherwise, any contribution intentionally submitted for inclusion in the work by you, as defined in the Apache-2.0 license, shall be licensed dual MIT/Apache 2, without any additional terms or conditions.

16
vendor/android_system_properties/Cargo.lock generated vendored Normal file
View File

@@ -0,0 +1,16 @@
# This file is automatically @generated by Cargo.
# It is not intended for manual editing.
version = 3
[[package]]
name = "android_system_properties"
version = "0.1.5"
dependencies = [
"libc",
]
[[package]]
name = "libc"
version = "0.2.126"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "349d5a591cd28b49e1d1037471617a32ddcda5731b99419008085f72d5a53836"

View File

@@ -0,0 +1,36 @@
# THIS FILE IS AUTOMATICALLY GENERATED BY CARGO
#
# When uploading crates to the registry Cargo will automatically
# "normalize" Cargo.toml files for maximal compatibility
# with all versions of Cargo and also rewrite `path` dependencies
# to registry (e.g., crates.io) dependencies.
#
# If you are reading this file be aware that the original Cargo.toml
# will likely look very different (and much more reasonable).
# See Cargo.toml.orig for the original contents.
[package]
edition = "2018"
name = "android_system_properties"
version = "0.1.5"
authors = ["Nicolas Silva <nical@fastmail.com>"]
description = "Minimal Android system properties wrapper"
homepage = "https://github.com/nical/android_system_properties"
documentation = "https://docs.rs/android_system_properties"
readme = "README.md"
keywords = ["android"]
license = "MIT/Apache-2.0"
repository = "https://github.com/nical/android_system_properties"
[package.metadata.docs.rs]
targets = [
"arm-linux-androideabi",
"armv7-linux-androideabi",
"aarch64-linux-android",
"i686-linux-android",
"x86_64-linux-android",
"x86_64-unknown-linux-gnu",
]
[dependencies.libc]
version = "0.2.126"

View File

@@ -0,0 +1,13 @@
Copyright 2016 Nicolas Silva
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.

View File

@@ -0,0 +1,20 @@
The MIT License (MIT)
Copyright (c) 2013 Nicolas Silva
Permission is hereby granted, free of charge, to any person obtaining a copy of
this software and associated documentation files (the "Software"), to deal in
the Software without restriction, including without limitation the rights to
use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of
the Software, and to permit persons to whom the Software is furnished to do so,
subject to the following conditions:
The above copyright notice and this permission notice shall be included in all
copies or substantial portions of the Software.
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS
FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR
COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER
IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.

vendor/android_system_properties/README.md vendored Normal file

@ -0,0 +1,36 @@
# android_system_properties
A thin rust wrapper for Android system properties.
This crate is similar to the `android-properties` crate with the exception that
the necessary Android libc symbols are loaded dynamically instead of linked
statically. In practice this means that the same binary will work with old and
new versions of Android, even though the API for reading system properties changed
around Android L.
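Concretely, the crate resolves the needed `__system_property_*` functions with `dlopen`/`dlsym` at runtime. Below is a minimal sketch of that pattern, not the crate's exact code (which is in `src/lib.rs` further down in this diff); the helper name `load_system_property_get` is only illustrative:
```rust
use std::os::raw::{c_char, c_int};

// Signature of the pre-Android-L fallback, `__system_property_get`.
type SystemPropertyGetFn = unsafe extern "C" fn(*const c_char, *mut c_char) -> c_int;

// Hypothetical helper: resolve the symbol from the already-loaded libc at
// runtime instead of linking against it, so one binary runs on Android
// versions with and without the newer property APIs.
unsafe fn load_system_property_get() -> Option<SystemPropertyGetFn> {
    let libc_so = libc::dlopen(b"libc.so\0".as_ptr().cast(), libc::RTLD_NOLOAD);
    if libc_so.is_null() {
        return None;
    }
    let sym = libc::dlsym(libc_so, b"__system_property_get\0".as_ptr().cast());
    if sym.is_null() {
        None
    } else {
        Some(std::mem::transmute::<*mut std::ffi::c_void, SystemPropertyGetFn>(sym))
    }
}
```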
## Example
```rust
use android_system_properties::AndroidSystemProperties;
let properties = AndroidSystemProperties::new();
if let Some(value) = properties.get("persist.sys.timezone") {
println!("{}", value);
}
```
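A slightly fuller sketch with a fallback for a missing property (mirroring the bundled example further down; the property name is only illustrative):
```rust
use android_system_properties::AndroidSystemProperties;

let properties = AndroidSystemProperties::new();
// `get` returns `None` when the property is unset or the libc symbols could
// not be loaded (e.g. when running off-device), so supply a default.
let tz = properties
    .get("persist.sys.timezone")
    .unwrap_or_else(|| "<unknown>".to_string());
println!("time zone: {}", tz);
```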
## Listing and setting properties
For the sake of simplicity this crate currently only contains what's needed by wgpu.
The implementations for listing and setting properties can be added back if anyone needs
them (let me know by filing an issue).
## License
Licensed under either of
* Apache License, Version 2.0 ([LICENSE-APACHE](LICENSE-APACHE) or http://www.apache.org/licenses/LICENSE-2.0)
* MIT license ([LICENSE-MIT](LICENSE-MIT) or http://opensource.org/licenses/MIT)
at your option.


@ -0,0 +1,9 @@
/// Prints the current time zone, e.g. "Europe/Paris".
use android_system_properties::AndroidSystemProperties;
fn main() {
let android_system_properties = AndroidSystemProperties::new();
let tz = android_system_properties.get("persist.sys.timezone");
println!("Your time zone is: {}", tz.as_deref().unwrap_or("<unknown>"));
}

vendor/android_system_properties/src/lib.rs vendored Normal file

@ -0,0 +1,221 @@
//! A thin rust wrapper for Android system properties.
//!
//! This crate is similar to the `android-properties` crate with the exception that
//! the necessary Android libc symbols are loaded dynamically instead of linked
//! statically. In practice this means that the same binary will work with old and
//! new versions of Android, even though the API for reading system properties changed
//! around Android L.
//!
//! ## Example
//!
//! ```rust
//! use android_system_properties::AndroidSystemProperties;
//!
//! let properties = AndroidSystemProperties::new();
//!
//! if let Some(value) = properties.get("persist.sys.timezone") {
//! println!("{}", value);
//! }
//! ```
//!
//! ## Listing and setting properties
//!
//! For the sake of simplicity this crate currently only contains what's needed by wgpu.
//! The implementations for listing and setting properties can be added back if anyone needs
//! them (let me know by filing an issue).
//!
//! ## License
//!
//! Licensed under either of
//!
//! * Apache License, Version 2.0 ([LICENSE-APACHE] or <http://www.apache.org/licenses/LICENSE-2.0>)
//! * MIT license ([LICENSE-MIT] or <http://opensource.org/licenses/MIT>)
//!
//! at your option.
//!
//! [LICENSE-APACHE]: https://github.com/nical/android_system_properties/blob/804681c5c1c93d4fab29c1a2f47b7d808dc70fd3/LICENSE-APACHE
//! [LICENSE-MIT]: https://github.com/nical/android_system_properties/blob/804681c5c1c93d4fab29c1a2f47b7d808dc70fd3/LICENSE-MIT
use std::{
ffi::{CStr, CString},
os::raw::{c_char, c_int, c_void},
};
#[cfg(target_os = "android")]
use std::mem;
unsafe fn property_callback(payload: *mut String, _name: *const c_char, value: *const c_char, _serial: u32) {
let cvalue = CStr::from_ptr(value);
(*payload) = cvalue.to_str().unwrap().to_string();
}
type Callback = unsafe fn(*mut String, *const c_char, *const c_char, u32);
type SystemPropertyGetFn = unsafe extern "C" fn(*const c_char, *mut c_char) -> c_int;
type SystemPropertyFindFn = unsafe extern "C" fn(*const c_char) -> *const c_void;
type SystemPropertyReadCallbackFn = unsafe extern "C" fn(*const c_void, Callback, *mut String) -> *const c_void;
#[derive(Debug)]
/// An object that can retrieve android system properties.
///
/// ## Example
///
/// ```
/// use android_system_properties::AndroidSystemProperties;
///
/// let properties = AndroidSystemProperties::new();
///
/// if let Some(value) = properties.get("persist.sys.timezone") {
/// println!("{}", value);
/// }
/// ```
pub struct AndroidSystemProperties {
libc_so: *mut c_void,
get_fn: Option<SystemPropertyGetFn>,
find_fn: Option<SystemPropertyFindFn>,
read_callback_fn: Option<SystemPropertyReadCallbackFn>,
}
unsafe impl Send for AndroidSystemProperties {}
unsafe impl Sync for AndroidSystemProperties {}
impl AndroidSystemProperties {
#[cfg(not(target_os = "android"))]
/// Create an entry point for accessing Android properties.
pub fn new() -> Self {
AndroidSystemProperties {
libc_so: std::ptr::null_mut(),
find_fn: None,
read_callback_fn: None,
get_fn: None,
}
}
#[cfg(target_os = "android")]
/// Create an entry point for accessing Android properties.
pub fn new() -> Self {
let libc_so = unsafe { libc::dlopen(b"libc.so\0".as_ptr().cast(), libc::RTLD_NOLOAD) };
let mut properties = AndroidSystemProperties {
libc_so,
find_fn: None,
read_callback_fn: None,
get_fn: None,
};
if libc_so.is_null() {
return properties;
}
unsafe fn load_fn(libc_so: *mut c_void, name: &[u8]) -> Option<*const c_void> {
let fn_ptr = libc::dlsym(libc_so, name.as_ptr().cast());
if fn_ptr.is_null() {
return None;
}
Some(fn_ptr)
}
unsafe {
properties.read_callback_fn = load_fn(libc_so, b"__system_property_read_callback\0")
.map(|raw| mem::transmute::<*const c_void, SystemPropertyReadCallbackFn>(raw));
properties.find_fn = load_fn(libc_so, b"__system_property_find\0")
.map(|raw| mem::transmute::<*const c_void, SystemPropertyFindFn>(raw));
// Fallback for old versions of Android.
if properties.read_callback_fn.is_none() || properties.find_fn.is_none() {
properties.get_fn = load_fn(libc_so, b"__system_property_get\0")
.map(|raw| mem::transmute::<*const c_void, SystemPropertyGetFn>(raw));
}
}
properties
}
/// Retrieve a system property.
///
/// Returns None if the operation fails.
///
/// # Example
///
/// ```
/// # use android_system_properties::AndroidSystemProperties;
/// let properties = AndroidSystemProperties::new();
///
/// if let Some(value) = properties.get("persist.sys.timezone") {
/// println!("{}", value);
/// }
/// ```
pub fn get(&self, name: &str) -> Option<String> {
let cname = CString::new(name).ok()?;
self.get_from_cstr(&cname)
}
/// Retrieve a system property using a [`CStr`] key.
///
/// Returns None if the operation fails.
///
/// # Example
///
/// ```
/// # use android_system_properties::AndroidSystemProperties;
/// # use std::ffi::CStr;
/// let properties = AndroidSystemProperties::new();
///
/// let key = unsafe { CStr::from_bytes_with_nul_unchecked(b"persist.sys.timezone\0") };
/// if let Some(value) = properties.get_from_cstr(key) {
/// println!("{}", value);
/// }
/// ```
pub fn get_from_cstr(&self, cname: &std::ffi::CStr) -> Option<String> {
// If available, use the recommended approach to accessing properties (Android L and onward).
if let (Some(find_fn), Some(read_callback_fn)) = (self.find_fn, self.read_callback_fn) {
let info = unsafe { (find_fn)(cname.as_ptr()) };
if info.is_null() {
return None;
}
let mut result = String::new();
unsafe {
(read_callback_fn)(info, property_callback, &mut result);
}
return Some(result);
}
// Fall back to the older approach.
if let Some(get_fn) = self.get_fn {
// The constant is PROP_VALUE_MAX in Android's libc/include/sys/system_properties.h
const PROPERTY_VALUE_MAX: usize = 92;
let mut buffer: Vec<u8> = Vec::with_capacity(PROPERTY_VALUE_MAX);
let raw = buffer.as_mut_ptr() as *mut c_char;
let len = unsafe { (get_fn)(cname.as_ptr(), raw) };
if len > 0 {
assert!(len as usize <= buffer.capacity());
unsafe { buffer.set_len(len as usize); }
String::from_utf8(buffer).ok()
} else {
None
}
} else {
None
}
}
}
impl Drop for AndroidSystemProperties {
fn drop(&mut self) {
if !self.libc_so.is_null() {
unsafe {
libc::dlclose(self.libc_so);
}
}
}
}

1
vendor/anstream/.cargo-checksum.json vendored Normal file

@ -0,0 +1 @@
{"files":{"Cargo.lock":"23d8ed34328e75b81cc00af942f61b1bc05434435e92fca8a2d649abfc1b8f49","Cargo.toml":"ceca3cf1fc87f6ec6c2c687410fc2e60ed31bf58c39c54e8237603abc423b246","LICENSE-APACHE":"c6596eb7be8581c18be736c846fb9173b69eccf6ef94c5135893ec56bd92ba08","LICENSE-MIT":"6efb0476a1cc085077ed49357026d8c173bf33017278ef440f222fb9cbcb66e6","README.md":"b230c2257d0c7a49b9bd97f2fa73abedcdc055757b5cedd2b0eb1a7a448ff461","benches/stream.rs":"7e666c4f4b79ddb5237361ed25264a966ee241192fbb2c1baea3006e3e0326b4","benches/strip.rs":"9603bd5ca1ae4661c2ccab50315dbfdec0c661ac2624262172bbd8f5d0bd87c9","benches/wincon.rs":"680e86933c008b242a3286c5149c33d3c086426eb99fe134b6e79f7578f96663","examples/dump-stream.rs":"9c5791bd739c3a74cfc24da90a5f96ee448b71ecf9800d3934028c5d3deb28e6","examples/query-stream.rs":"16f38843083174fbefa974a5aa38a5f3ffa51bd6e6db3dc1d91164462219399e","src/adapter/mod.rs":"baf4237ea0b18df63609e49d93572ca27c2202a4cbec0220adb5a7e815c7d8ed","src/adapter/strip.rs":"b324562426cb7ad8bceeeb8ea012746b5a046f901ea878d6de8d61f96ec96a55","src/adapter/wincon.rs":"96ce7d753abb4d6ed42c044545a4f557455bb825432904d6316c0aa245eb0085","src/auto.rs":"aa7f0988fc1c3f8c0d5bf1ff12e108cc3eb29d330f28da02cb4a2e09ec9fcc7c","src/buffer.rs":"83e7088b50dd3e2941c06a417d9eef75fda45311a2912ba94f480ec98d6f0183","src/fmt.rs":"cc11b005c4559843bd908a57958a13c8d0922fae6aff5261f3583c90e60da73c","src/lib.rs":"7502dcd2be531d787eac7efde761c95c892a425b55a53d6d99ef5d99d912a5f0","src/macros.rs":"a26ababe32a39732d0aade9674f6e5e267bd26c6ea06603ff9e61e80681195e0","src/stream.rs":"cbe8f61fba4c3c60934339c8bda5d1ff43320f57cdc4ed409aa173945a941b3d","src/strip.rs":"09c8bcd5bda0b07b56929026d965222d8129908f8386350b87314bc5fefcc2fe","src/wincon.rs":"e85c03ccfeca352a32572db8bb6c903f78c2003f5b375254edc5a69d6843728f"},"package":"d96bd03f33fe50a863e394ee9718a706f988b9079b20c3784fb726e7678b62fb"}

1094
vendor/anstream/Cargo.lock generated vendored Normal file

File diff suppressed because it is too large Load Diff

140
vendor/anstream/Cargo.toml vendored Normal file

@ -0,0 +1,140 @@
# THIS FILE IS AUTOMATICALLY GENERATED BY CARGO
#
# When uploading crates to the registry Cargo will automatically
# "normalize" Cargo.toml files for maximal compatibility
# with all versions of Cargo and also rewrite `path` dependencies
# to registry (e.g., crates.io) dependencies.
#
# If you are reading this file be aware that the original Cargo.toml
# will likely look very different (and much more reasonable).
# See Cargo.toml.orig for the original contents.
[package]
edition = "2021"
rust-version = "1.70.0"
name = "anstream"
version = "0.6.13"
include = [
"build.rs",
"src/**/*",
"Cargo.toml",
"Cargo.lock",
"LICENSE*",
"README.md",
"benches/**/*",
"examples/**/*",
]
description = "A simple cross platform library for writing colored text to a terminal."
homepage = "https://github.com/rust-cli/anstyle"
readme = "README.md"
keywords = [
"ansi",
"terminal",
"color",
"strip",
"wincon",
]
categories = ["command-line-interface"]
license = "MIT OR Apache-2.0"
repository = "https://github.com/rust-cli/anstyle.git"
[package.metadata.docs.rs]
cargo-args = [
"-Zunstable-options",
"-Zrustdoc-scrape-examples",
]
rustdoc-args = [
"--cfg",
"docsrs",
]
[[package.metadata.release.pre-release-replacements]]
file = "CHANGELOG.md"
min = 1
replace = "{{version}}"
search = "Unreleased"
[[package.metadata.release.pre-release-replacements]]
exactly = 1
file = "CHANGELOG.md"
replace = "...{{tag_name}}"
search = '\.\.\.HEAD'
[[package.metadata.release.pre-release-replacements]]
file = "CHANGELOG.md"
min = 1
replace = "{{date}}"
search = "ReleaseDate"
[[package.metadata.release.pre-release-replacements]]
exactly = 1
file = "CHANGELOG.md"
replace = """
<!-- next-header -->
## [Unreleased] - ReleaseDate
"""
search = "<!-- next-header -->"
[[package.metadata.release.pre-release-replacements]]
exactly = 1
file = "CHANGELOG.md"
replace = """
<!-- next-url -->
[Unreleased]: https://github.com/rust-cli/anstyle/compare/{{tag_name}}...HEAD"""
search = "<!-- next-url -->"
[[bench]]
name = "strip"
harness = false
[[bench]]
name = "wincon"
harness = false
[[bench]]
name = "stream"
harness = false
[dependencies.anstyle]
version = "1.0.0"
[dependencies.anstyle-parse]
version = "0.2.0"
[dependencies.anstyle-query]
version = "1.0.0"
optional = true
[dependencies.colorchoice]
version = "1.0.0"
[dependencies.utf8parse]
version = "0.2.1"
[dev-dependencies.criterion]
version = "0.5.1"
[dev-dependencies.lexopt]
version = "0.3.0"
[dev-dependencies.owo-colors]
version = "4.0.0"
[dev-dependencies.proptest]
version = "1.4.0"
[dev-dependencies.strip-ansi-escapes]
version = "0.2.0"
[features]
auto = ["dep:anstyle-query"]
default = [
"auto",
"wincon",
]
test = []
wincon = ["dep:anstyle-wincon"]
[target."cfg(windows)".dependencies.anstyle-wincon]
version = "3.0.1"
optional = true

202
vendor/anstream/LICENSE-APACHE vendored Normal file

@ -0,0 +1,202 @@
Apache License
Version 2.0, January 2004
http://www.apache.org/licenses/
TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION
1. Definitions.
"License" shall mean the terms and conditions for use, reproduction,
and distribution as defined by Sections 1 through 9 of this document.
"Licensor" shall mean the copyright owner or entity authorized by
the copyright owner that is granting the License.
"Legal Entity" shall mean the union of the acting entity and all
other entities that control, are controlled by, or are under common
control with that entity. For the purposes of this definition,
"control" means (i) the power, direct or indirect, to cause the
direction or management of such entity, whether by contract or
otherwise, or (ii) ownership of fifty percent (50%) or more of the
outstanding shares, or (iii) beneficial ownership of such entity.
"You" (or "Your") shall mean an individual or Legal Entity
exercising permissions granted by this License.
"Source" form shall mean the preferred form for making modifications,
including but not limited to software source code, documentation
source, and configuration files.
"Object" form shall mean any form resulting from mechanical
transformation or translation of a Source form, including but
not limited to compiled object code, generated documentation,
and conversions to other media types.
"Work" shall mean the work of authorship, whether in Source or
Object form, made available under the License, as indicated by a
copyright notice that is included in or attached to the work
(an example is provided in the Appendix below).
"Derivative Works" shall mean any work, whether in Source or Object
form, that is based on (or derived from) the Work and for which the
editorial revisions, annotations, elaborations, or other modifications
represent, as a whole, an original work of authorship. For the purposes
of this License, Derivative Works shall not include works that remain
separable from, or merely link (or bind by name) to the interfaces of,
the Work and Derivative Works thereof.
"Contribution" shall mean any work of authorship, including
the original version of the Work and any modifications or additions
to that Work or Derivative Works thereof, that is intentionally
submitted to Licensor for inclusion in the Work by the copyright owner
or by an individual or Legal Entity authorized to submit on behalf of
the copyright owner. For the purposes of this definition, "submitted"
means any form of electronic, verbal, or written communication sent
to the Licensor or its representatives, including but not limited to
communication on electronic mailing lists, source code control systems,
and issue tracking systems that are managed by, or on behalf of, the
Licensor for the purpose of discussing and improving the Work, but
excluding communication that is conspicuously marked or otherwise
designated in writing by the copyright owner as "Not a Contribution."
"Contributor" shall mean Licensor and any individual or Legal Entity
on behalf of whom a Contribution has been received by Licensor and
subsequently incorporated within the Work.
2. Grant of Copyright License. Subject to the terms and conditions of
this License, each Contributor hereby grants to You a perpetual,
worldwide, non-exclusive, no-charge, royalty-free, irrevocable
copyright license to reproduce, prepare Derivative Works of,
publicly display, publicly perform, sublicense, and distribute the
Work and such Derivative Works in Source or Object form.
3. Grant of Patent License. Subject to the terms and conditions of
this License, each Contributor hereby grants to You a perpetual,
worldwide, non-exclusive, no-charge, royalty-free, irrevocable
(except as stated in this section) patent license to make, have made,
use, offer to sell, sell, import, and otherwise transfer the Work,
where such license applies only to those patent claims licensable
by such Contributor that are necessarily infringed by their
Contribution(s) alone or by combination of their Contribution(s)
with the Work to which such Contribution(s) was submitted. If You
institute patent litigation against any entity (including a
cross-claim or counterclaim in a lawsuit) alleging that the Work
or a Contribution incorporated within the Work constitutes direct
or contributory patent infringement, then any patent licenses
granted to You under this License for that Work shall terminate
as of the date such litigation is filed.
4. Redistribution. You may reproduce and distribute copies of the
Work or Derivative Works thereof in any medium, with or without
modifications, and in Source or Object form, provided that You
meet the following conditions:
(a) You must give any other recipients of the Work or
Derivative Works a copy of this License; and
(b) You must cause any modified files to carry prominent notices
stating that You changed the files; and
(c) You must retain, in the Source form of any Derivative Works
that You distribute, all copyright, patent, trademark, and
attribution notices from the Source form of the Work,
excluding those notices that do not pertain to any part of
the Derivative Works; and
(d) If the Work includes a "NOTICE" text file as part of its
distribution, then any Derivative Works that You distribute must
include a readable copy of the attribution notices contained
within such NOTICE file, excluding those notices that do not
pertain to any part of the Derivative Works, in at least one
of the following places: within a NOTICE text file distributed
as part of the Derivative Works; within the Source form or
documentation, if provided along with the Derivative Works; or,
within a display generated by the Derivative Works, if and
wherever such third-party notices normally appear. The contents
of the NOTICE file are for informational purposes only and
do not modify the License. You may add Your own attribution
notices within Derivative Works that You distribute, alongside
or as an addendum to the NOTICE text from the Work, provided
that such additional attribution notices cannot be construed
as modifying the License.
You may add Your own copyright statement to Your modifications and
may provide additional or different license terms and conditions
for use, reproduction, or distribution of Your modifications, or
for any such Derivative Works as a whole, provided Your use,
reproduction, and distribution of the Work otherwise complies with
the conditions stated in this License.
5. Submission of Contributions. Unless You explicitly state otherwise,
any Contribution intentionally submitted for inclusion in the Work
by You to the Licensor shall be under the terms and conditions of
this License, without any additional terms or conditions.
Notwithstanding the above, nothing herein shall supersede or modify
the terms of any separate license agreement you may have executed
with Licensor regarding such Contributions.
6. Trademarks. This License does not grant permission to use the trade
names, trademarks, service marks, or product names of the Licensor,
except as required for reasonable and customary use in describing the
origin of the Work and reproducing the content of the NOTICE file.
7. Disclaimer of Warranty. Unless required by applicable law or
agreed to in writing, Licensor provides the Work (and each
Contributor provides its Contributions) on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
implied, including, without limitation, any warranties or conditions
of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A
PARTICULAR PURPOSE. You are solely responsible for determining the
appropriateness of using or redistributing the Work and assume any
risks associated with Your exercise of permissions under this License.
8. Limitation of Liability. In no event and under no legal theory,
whether in tort (including negligence), contract, or otherwise,
unless required by applicable law (such as deliberate and grossly
negligent acts) or agreed to in writing, shall any Contributor be
liable to You for damages, including any direct, indirect, special,
incidental, or consequential damages of any character arising as a
result of this License or out of the use or inability to use the
Work (including but not limited to damages for loss of goodwill,
work stoppage, computer failure or malfunction, or any and all
other commercial damages or losses), even if such Contributor
has been advised of the possibility of such damages.
9. Accepting Warranty or Additional Liability. While redistributing
the Work or Derivative Works thereof, You may choose to offer,
and charge a fee for, acceptance of support, warranty, indemnity,
or other liability obligations and/or rights consistent with this
License. However, in accepting such obligations, You may act only
on Your own behalf and on Your sole responsibility, not on behalf
of any other Contributor, and only if You agree to indemnify,
defend, and hold each Contributor harmless for any liability
incurred by, or claims asserted against, such Contributor by reason
of your accepting any such warranty or additional liability.
END OF TERMS AND CONDITIONS
APPENDIX: How to apply the Apache License to your work.
To apply the Apache License to your work, attach the following
boilerplate notice, with the fields enclosed by brackets "{}"
replaced with your own identifying information. (Don't include
the brackets!) The text should be enclosed in the appropriate
comment syntax for the file format. We also recommend that a
file or class name and description of purpose be included on the
same "printed page" as the copyright notice for easier
identification within third-party archives.
Copyright {yyyy} {name of copyright owner}
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.

19
vendor/anstream/LICENSE-MIT vendored Normal file

@ -0,0 +1,19 @@
Copyright (c) Individual contributors
Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:
The above copyright notice and this permission notice shall be included in all
copies or substantial portions of the Software.
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
SOFTWARE.

34
vendor/anstream/README.md vendored Normal file

@ -0,0 +1,34 @@
# anstream
> A simple cross platform library for writing colored text to a terminal.
*A portmanteau of "ansi stream"*
[![Documentation](https://img.shields.io/badge/docs-master-blue.svg)][Documentation]
![License](https://img.shields.io/crates/l/anstream.svg)
[![Crates Status](https://img.shields.io/crates/v/anstream.svg)](https://crates.io/crates/anstream)
Specialized `stdout` and `stderr` that accept ANSI escape codes and adapt them
based on the terminal's capabilities.
`anstream::adapter::strip_str` may also be of interest on its own for low
overhead stripping of ANSI escape codes.
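A minimal usage sketch, assuming the crate as vendored here (`anstream::stdout()` is the adaptive stdout also used by `examples/dump-stream.rs` below):
```rust
use std::io::Write as _;

fn main() -> std::io::Result<()> {
    // Adaptive stdout: ANSI escapes pass through on capable terminals and are
    // stripped when the output is redirected to a file or pipe.
    let mut out = anstream::stdout();
    writeln!(out, "\x1b[32mgreen\x1b[0m when supported")?;

    // Stand-alone, low-overhead stripping of escapes from existing text.
    let plain = anstream::adapter::strip_str("\x1b[32mfoo\x1b[m bar").to_string();
    assert_eq!(plain, "foo bar");
    Ok(())
}
```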
## License
Licensed under either of
* Apache License, Version 2.0, ([LICENSE-APACHE](LICENSE-APACHE) or http://www.apache.org/licenses/LICENSE-2.0)
* MIT license ([LICENSE-MIT](LICENSE-MIT) or http://opensource.org/licenses/MIT)
at your option.
### Contribution
Unless you explicitly state otherwise, any contribution intentionally
submitted for inclusion in the work by you, as defined in the Apache-2.0
license, shall be dual licensed as above, without any additional terms or
conditions.
[Crates.io]: https://crates.io/crates/anstream
[Documentation]: https://docs.rs/anstream

81
vendor/anstream/benches/stream.rs vendored Normal file

@ -0,0 +1,81 @@
use std::io::Write as _;
use criterion::{black_box, Criterion};
fn stream(c: &mut Criterion) {
for (name, content) in [
("demo.vte", &include_bytes!("../tests/demo.vte")[..]),
("rg_help.vte", &include_bytes!("../tests/rg_help.vte")[..]),
("rg_linus.vte", &include_bytes!("../tests/rg_linus.vte")[..]),
(
"state_changes",
&b"\x1b]2;X\x1b\\ \x1b[0m \x1bP0@\x1b\\"[..],
),
] {
let mut group = c.benchmark_group(name);
group.bench_function("nop", |b| {
b.iter(|| {
let buffer = Vec::with_capacity(content.len());
let mut stream = buffer;
stream.write_all(content).unwrap();
black_box(stream)
})
});
group.bench_function("StripStream", |b| {
b.iter(|| {
let buffer = Vec::with_capacity(content.len());
let mut stream = anstream::StripStream::new(buffer);
stream.write_all(content).unwrap();
black_box(stream)
})
});
#[cfg(all(windows, feature = "wincon"))]
group.bench_function("WinconStream", |b| {
b.iter(|| {
let buffer = Vec::with_capacity(content.len());
let mut stream = anstream::WinconStream::new(buffer);
stream.write_all(content).unwrap();
black_box(stream)
})
});
group.bench_function("AutoStream::always_ansi", |b| {
b.iter(|| {
let buffer = Vec::with_capacity(content.len());
let mut stream = anstream::AutoStream::always_ansi(buffer);
stream.write_all(content).unwrap();
black_box(stream)
})
});
group.bench_function("AutoStream::always", |b| {
b.iter(|| {
let buffer = Vec::with_capacity(content.len());
let mut stream = anstream::AutoStream::always(buffer);
stream.write_all(content).unwrap();
black_box(stream)
})
});
group.bench_function("AutoStream::never", |b| {
b.iter(|| {
let buffer = Vec::with_capacity(content.len());
let mut stream = anstream::AutoStream::never(buffer);
stream.write_all(content).unwrap();
black_box(stream)
})
});
}
}
criterion::criterion_group!(benches, stream);
criterion::criterion_main!(benches);

102
vendor/anstream/benches/strip.rs vendored Normal file

@ -0,0 +1,102 @@
use criterion::{black_box, Criterion};
#[derive(Default)]
struct Strip(String);
impl Strip {
fn with_capacity(capacity: usize) -> Self {
Self(String::with_capacity(capacity))
}
}
impl anstyle_parse::Perform for Strip {
fn print(&mut self, c: char) {
self.0.push(c);
}
fn execute(&mut self, byte: u8) {
if byte.is_ascii_whitespace() {
self.0.push(byte as char);
}
}
}
fn strip(c: &mut Criterion) {
for (name, content) in [
("demo.vte", &include_bytes!("../tests/demo.vte")[..]),
("rg_help.vte", &include_bytes!("../tests/rg_help.vte")[..]),
("rg_linus.vte", &include_bytes!("../tests/rg_linus.vte")[..]),
(
"state_changes",
&b"\x1b]2;X\x1b\\ \x1b[0m \x1bP0@\x1b\\"[..],
),
] {
// Make sure the comparison is fair
if let Ok(content) = std::str::from_utf8(content) {
let mut stripped = Strip::with_capacity(content.len());
let mut parser = anstyle_parse::Parser::<anstyle_parse::DefaultCharAccumulator>::new();
for byte in content.as_bytes() {
parser.advance(&mut stripped, *byte);
}
assert_eq!(
stripped.0,
anstream::adapter::strip_str(content).to_string()
);
assert_eq!(
stripped.0,
String::from_utf8(anstream::adapter::strip_bytes(content.as_bytes()).into_vec())
.unwrap()
);
}
let mut group = c.benchmark_group(name);
group.bench_function("advance_strip", |b| {
b.iter(|| {
let mut stripped = Strip::with_capacity(content.len());
let mut parser =
anstyle_parse::Parser::<anstyle_parse::DefaultCharAccumulator>::new();
for byte in content {
parser.advance(&mut stripped, *byte);
}
black_box(stripped.0)
})
});
group.bench_function("strip_ansi_escapes", |b| {
b.iter(|| {
let stripped = strip_ansi_escapes::strip(content);
black_box(stripped)
})
});
if let Ok(content) = std::str::from_utf8(content) {
group.bench_function("strip_str", |b| {
b.iter(|| {
let stripped = anstream::adapter::strip_str(content).to_string();
black_box(stripped)
})
});
group.bench_function("StripStr", |b| {
b.iter(|| {
let mut stripped = String::with_capacity(content.len());
let mut state = anstream::adapter::StripStr::new();
for printable in state.strip_next(content) {
stripped.push_str(printable);
}
black_box(stripped)
})
});
}
group.bench_function("strip_bytes", |b| {
b.iter(|| {
let stripped = anstream::adapter::strip_bytes(content).into_vec();
black_box(stripped)
})
});
}
}
criterion::criterion_group!(benches, strip);
criterion::criterion_main!(benches);

26
vendor/anstream/benches/wincon.rs vendored Normal file

@ -0,0 +1,26 @@
use criterion::{black_box, Criterion};
fn wincon(c: &mut Criterion) {
for (name, content) in [
("demo.vte", &include_bytes!("../tests/demo.vte")[..]),
("rg_help.vte", &include_bytes!("../tests/rg_help.vte")[..]),
("rg_linus.vte", &include_bytes!("../tests/rg_linus.vte")[..]),
(
"state_changes",
&b"\x1b]2;X\x1b\\ \x1b[0m \x1bP0@\x1b\\"[..],
),
] {
let mut group = c.benchmark_group(name);
group.bench_function("wincon_bytes", |b| {
b.iter(|| {
let mut state = anstream::adapter::WinconBytes::new();
let stripped = state.extract_next(content).collect::<Vec<_>>();
black_box(stripped)
})
});
}
}
criterion::criterion_group!(benches, wincon);
criterion::criterion_main!(benches);

126
vendor/anstream/examples/dump-stream.rs vendored Normal file

@ -0,0 +1,126 @@
use std::io::Write;
fn main() -> Result<(), lexopt::Error> {
let args = Args::parse()?;
let stdout = anstream::stdout();
let mut stdout = stdout.lock();
for fixed in 0..16 {
let color = anstyle::Ansi256Color(fixed).into_ansi().unwrap();
let style = style(color, args.layer, args.effects);
let _ = print_number(&mut stdout, fixed, style);
if fixed == 7 || fixed == 15 {
let _ = writeln!(&mut stdout);
}
}
for fixed in 16..232 {
let col = (fixed - 16) % 36;
if col == 0 {
let _ = writeln!(stdout);
}
let color = anstyle::Ansi256Color(fixed);
let style = style(color, args.layer, args.effects);
let _ = print_number(&mut stdout, fixed, style);
}
let _ = writeln!(stdout);
let _ = writeln!(stdout);
for fixed in 232..=255 {
let color = anstyle::Ansi256Color(fixed);
let style = style(color, args.layer, args.effects);
let _ = print_number(&mut stdout, fixed, style);
}
let _ = writeln!(stdout);
Ok(())
}
fn style(
color: impl Into<anstyle::Color>,
layer: Layer,
effects: anstyle::Effects,
) -> anstyle::Style {
let color = color.into();
(match layer {
Layer::Fg => anstyle::Style::new().fg_color(Some(color)),
Layer::Bg => anstyle::Style::new().bg_color(Some(color)),
Layer::Underline => anstyle::Style::new().underline_color(Some(color)),
}) | effects
}
fn print_number(stdout: &mut impl Write, fixed: u8, style: anstyle::Style) -> std::io::Result<()> {
write!(stdout, "{style}{fixed:>3X}{style:#}",)
}
#[derive(Default)]
struct Args {
effects: anstyle::Effects,
layer: Layer,
}
#[derive(Copy, Clone, Default)]
enum Layer {
#[default]
Fg,
Bg,
Underline,
}
impl Args {
fn parse() -> Result<Self, lexopt::Error> {
use lexopt::prelude::*;
let mut res = Args::default();
let mut args = lexopt::Parser::from_env();
while let Some(arg) = args.next()? {
match arg {
Long("layer") => {
res.layer = args.value()?.parse_with(|s| match s {
"fg" => Ok(Layer::Fg),
"bg" => Ok(Layer::Bg),
"underline" => Ok(Layer::Underline),
_ => Err("expected values fg, bg, underline"),
})?;
}
Long("effect") => {
const EFFECTS: [(&str, anstyle::Effects); 12] = [
("bold", anstyle::Effects::BOLD),
("dimmed", anstyle::Effects::DIMMED),
("italic", anstyle::Effects::ITALIC),
("underline", anstyle::Effects::UNDERLINE),
("double_underline", anstyle::Effects::DOUBLE_UNDERLINE),
("curly_underline", anstyle::Effects::CURLY_UNDERLINE),
("dotted_underline", anstyle::Effects::DOTTED_UNDERLINE),
("dashed_underline", anstyle::Effects::DASHED_UNDERLINE),
("blink", anstyle::Effects::BLINK),
("invert", anstyle::Effects::INVERT),
("hidden", anstyle::Effects::HIDDEN),
("strikethrough", anstyle::Effects::STRIKETHROUGH),
];
let effect = args.value()?.parse_with(|s| {
EFFECTS
.into_iter()
.find(|(name, _)| *name == s)
.map(|(_, effect)| effect)
.ok_or_else(|| {
format!(
"expected one of {}",
EFFECTS
.into_iter()
.map(|(n, _)| n)
.collect::<Vec<_>>()
.join(", ")
)
})
})?;
res.effects = res.effects.insert(effect);
}
_ => return Err(arg.unexpected()),
}
}
Ok(res)
}
}

vendor/anstream/examples/query-stream.rs vendored Normal file

@ -0,0 +1,20 @@
fn main() {
println!("stdout:");
println!(
" choice: {:?}",
anstream::AutoStream::choice(&std::io::stdout())
);
println!(
" choice: {:?}",
anstream::AutoStream::auto(std::io::stdout()).current_choice()
);
println!("stderr:");
println!(
" choice: {:?}",
anstream::AutoStream::choice(&std::io::stderr())
);
println!(
" choice: {:?}",
anstream::AutoStream::auto(std::io::stderr()).current_choice()
);
}

15
vendor/anstream/src/adapter/mod.rs vendored Normal file

@ -0,0 +1,15 @@
//! Gracefully degrade styled output
mod strip;
mod wincon;
pub use strip::strip_bytes;
pub use strip::strip_str;
pub use strip::StripBytes;
pub use strip::StripBytesIter;
pub use strip::StripStr;
pub use strip::StripStrIter;
pub use strip::StrippedBytes;
pub use strip::StrippedStr;
pub use wincon::WinconBytes;
pub use wincon::WinconBytesIter;
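The incremental forms re-exported here (`StripStr`, `StripBytes`) are meant for data that arrives in non-contiguous chunks; a minimal sketch of that use, along the lines of `benches/strip.rs` above:
```rust
use anstream::adapter::StripStr;

fn main() {
    // One StripStr keeps parser state across chunks; collect only the
    // printable text yielded for each chunk.
    let mut state = StripStr::new();
    let mut plain = String::new();
    for chunk in ["\x1b[32mfoo", "\x1b[m bar"] {
        for printable in state.strip_next(chunk) {
            plain.push_str(printable);
        }
    }
    assert_eq!(plain, "foo bar");
}
```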

509
vendor/anstream/src/adapter/strip.rs vendored Normal file

@ -0,0 +1,509 @@
use anstyle_parse::state::state_change;
use anstyle_parse::state::Action;
use anstyle_parse::state::State;
/// Strip ANSI escapes from a `&str`, returning the printable content
///
/// This can be used to take output from a program that includes escape sequences and write it
/// somewhere that does not easily support them, such as a log file.
///
/// For non-contiguous data, see [`StripStr`].
///
/// # Example
///
/// ```rust
/// use std::io::Write as _;
///
/// let styled_text = "\x1b[32mfoo\x1b[m bar";
/// let plain_str = anstream::adapter::strip_str(&styled_text).to_string();
/// assert_eq!(plain_str, "foo bar");
/// ```
#[inline]
pub fn strip_str(data: &str) -> StrippedStr<'_> {
StrippedStr::new(data)
}
/// See [`strip_str`]
#[derive(Default, Clone, Debug, PartialEq, Eq)]
pub struct StrippedStr<'s> {
bytes: &'s [u8],
state: State,
}
impl<'s> StrippedStr<'s> {
#[inline]
fn new(data: &'s str) -> Self {
Self {
bytes: data.as_bytes(),
state: State::Ground,
}
}
/// Create a [`String`] of the printable content
#[inline]
#[allow(clippy::inherent_to_string_shadow_display)] // Single-allocation implementation
pub fn to_string(&self) -> String {
use std::fmt::Write as _;
let mut stripped = String::with_capacity(self.bytes.len());
let _ = write!(&mut stripped, "{}", self);
stripped
}
}
impl<'s> std::fmt::Display for StrippedStr<'s> {
/// **Note:** this does *not* exhaust the [`Iterator`]
#[inline]
fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
let iter = Self {
bytes: self.bytes,
state: self.state,
};
for printable in iter {
printable.fmt(f)?;
}
Ok(())
}
}
impl<'s> Iterator for StrippedStr<'s> {
type Item = &'s str;
#[inline]
fn next(&mut self) -> Option<Self::Item> {
next_str(&mut self.bytes, &mut self.state)
}
}
/// Incrementally strip non-contiguous data
#[derive(Default, Clone, Debug, PartialEq, Eq)]
pub struct StripStr {
state: State,
}
impl StripStr {
/// Initial state
pub fn new() -> Self {
Default::default()
}
/// Strip the next segment of data
pub fn strip_next<'s>(&'s mut self, data: &'s str) -> StripStrIter<'s> {
StripStrIter {
bytes: data.as_bytes(),
state: &mut self.state,
}
}
}
/// See [`StripStr`]
#[derive(Debug, PartialEq, Eq)]
pub struct StripStrIter<'s> {
bytes: &'s [u8],
state: &'s mut State,
}
impl<'s> Iterator for StripStrIter<'s> {
type Item = &'s str;
#[inline]
fn next(&mut self) -> Option<Self::Item> {
next_str(&mut self.bytes, self.state)
}
}
#[inline]
fn next_str<'s>(bytes: &mut &'s [u8], state: &mut State) -> Option<&'s str> {
let offset = bytes.iter().copied().position(|b| {
let (next_state, action) = state_change(*state, b);
if next_state != State::Anywhere {
*state = next_state;
}
is_printable_bytes(action, b)
});
let (_, next) = bytes.split_at(offset.unwrap_or(bytes.len()));
*bytes = next;
*state = State::Ground;
let offset = bytes.iter().copied().position(|b| {
let (_next_state, action) = state_change(State::Ground, b);
!(is_printable_bytes(action, b) || is_utf8_continuation(b))
});
let (printable, next) = bytes.split_at(offset.unwrap_or(bytes.len()));
*bytes = next;
if printable.is_empty() {
None
} else {
let printable = unsafe {
from_utf8_unchecked(
printable,
"`bytes` was validated as UTF-8, the parser preserves UTF-8 continuations",
)
};
Some(printable)
}
}
#[inline]
unsafe fn from_utf8_unchecked<'b>(bytes: &'b [u8], safety_justification: &'static str) -> &'b str {
if cfg!(debug_assertions) {
// Catch problems more quickly when testing
std::str::from_utf8(bytes).expect(safety_justification)
} else {
std::str::from_utf8_unchecked(bytes)
}
}
#[inline]
fn is_utf8_continuation(b: u8) -> bool {
matches!(b, 0x80..=0xbf)
}
/// Strip ANSI escapes from bytes, returning the printable content
///
/// This can be used to take output from a program that includes escape sequences and write it
/// somewhere that does not easily support them, such as a log file.
///
/// # Example
///
/// ```rust
/// use std::io::Write as _;
///
/// let styled_text = "\x1b[32mfoo\x1b[m bar";
/// let plain_str = anstream::adapter::strip_bytes(styled_text.as_bytes()).into_vec();
/// assert_eq!(plain_str.as_slice(), &b"foo bar"[..]);
/// ```
#[inline]
pub fn strip_bytes(data: &[u8]) -> StrippedBytes<'_> {
StrippedBytes::new(data)
}
/// See [`strip_bytes`]
#[derive(Default, Clone, Debug, PartialEq, Eq)]
pub struct StrippedBytes<'s> {
bytes: &'s [u8],
state: State,
utf8parser: Utf8Parser,
}
impl<'s> StrippedBytes<'s> {
/// See [`strip_bytes`]
#[inline]
pub fn new(bytes: &'s [u8]) -> Self {
Self {
bytes,
state: State::Ground,
utf8parser: Default::default(),
}
}
/// Strip the next slice of bytes
///
/// Used when the content is in several non-contiguous slices
///
/// # Panic
///
/// May panic if it is not exhausted / empty
#[inline]
pub fn extend(&mut self, bytes: &'s [u8]) {
debug_assert!(
self.is_empty(),
"current bytes must be processed to ensure we end at the right state"
);
self.bytes = bytes;
}
/// Report the bytes has been exhausted
#[inline]
pub fn is_empty(&self) -> bool {
self.bytes.is_empty()
}
/// Create a [`Vec`] of the printable content
#[inline]
pub fn into_vec(self) -> Vec<u8> {
let mut stripped = Vec::with_capacity(self.bytes.len());
for printable in self {
stripped.extend(printable);
}
stripped
}
}
impl<'s> Iterator for StrippedBytes<'s> {
type Item = &'s [u8];
#[inline]
fn next(&mut self) -> Option<Self::Item> {
next_bytes(&mut self.bytes, &mut self.state, &mut self.utf8parser)
}
}
/// Incrementally strip non-contiguous data
#[derive(Default, Clone, Debug, PartialEq, Eq)]
pub struct StripBytes {
state: State,
utf8parser: Utf8Parser,
}
impl StripBytes {
/// Initial state
pub fn new() -> Self {
Default::default()
}
/// Strip the next segment of data
pub fn strip_next<'s>(&'s mut self, bytes: &'s [u8]) -> StripBytesIter<'s> {
StripBytesIter {
bytes,
state: &mut self.state,
utf8parser: &mut self.utf8parser,
}
}
}
/// See [`StripBytes`]
#[derive(Debug, PartialEq, Eq)]
pub struct StripBytesIter<'s> {
bytes: &'s [u8],
state: &'s mut State,
utf8parser: &'s mut Utf8Parser,
}
impl<'s> Iterator for StripBytesIter<'s> {
type Item = &'s [u8];
#[inline]
fn next(&mut self) -> Option<Self::Item> {
next_bytes(&mut self.bytes, self.state, self.utf8parser)
}
}
#[inline]
fn next_bytes<'s>(
bytes: &mut &'s [u8],
state: &mut State,
utf8parser: &mut Utf8Parser,
) -> Option<&'s [u8]> {
let offset = bytes.iter().copied().position(|b| {
if *state == State::Utf8 {
true
} else {
let (next_state, action) = state_change(*state, b);
if next_state != State::Anywhere {
*state = next_state;
}
is_printable_bytes(action, b)
}
});
let (_, next) = bytes.split_at(offset.unwrap_or(bytes.len()));
*bytes = next;
let offset = bytes.iter().copied().position(|b| {
if *state == State::Utf8 {
if utf8parser.add(b) {
*state = State::Ground;
}
false
} else {
let (next_state, action) = state_change(State::Ground, b);
if next_state != State::Anywhere {
*state = next_state;
}
if *state == State::Utf8 {
utf8parser.add(b);
false
} else {
!is_printable_bytes(action, b)
}
}
});
let (printable, next) = bytes.split_at(offset.unwrap_or(bytes.len()));
*bytes = next;
if printable.is_empty() {
None
} else {
Some(printable)
}
}
#[derive(Default, Clone, Debug, PartialEq, Eq)]
pub struct Utf8Parser {
utf8_parser: utf8parse::Parser,
}
impl Utf8Parser {
fn add(&mut self, byte: u8) -> bool {
let mut b = false;
let mut receiver = VtUtf8Receiver(&mut b);
self.utf8_parser.advance(&mut receiver, byte);
b
}
}
struct VtUtf8Receiver<'a>(&'a mut bool);
impl<'a> utf8parse::Receiver for VtUtf8Receiver<'a> {
fn codepoint(&mut self, _: char) {
*self.0 = true;
}
fn invalid_sequence(&mut self) {
*self.0 = true;
}
}
#[inline]
fn is_printable_bytes(action: Action, byte: u8) -> bool {
// VT320 considered 0x7f to be `Print`able but we expect to be working in UTF-8 systems and not
// ISO Latin-1, making it DEL and non-printable
const DEL: u8 = 0x7f;
// Continuations aren't included as they may also be control codes, requiring more context
(action == Action::Print && byte != DEL)
|| action == Action::BeginUtf8
|| (action == Action::Execute && byte.is_ascii_whitespace())
}
#[cfg(test)]
mod test {
use super::*;
use proptest::prelude::*;
/// Model based off full parser
fn parser_strip(bytes: &[u8]) -> String {
#[derive(Default)]
struct Strip(String);
impl Strip {
fn with_capacity(capacity: usize) -> Self {
Self(String::with_capacity(capacity))
}
}
impl anstyle_parse::Perform for Strip {
fn print(&mut self, c: char) {
self.0.push(c);
}
fn execute(&mut self, byte: u8) {
if byte.is_ascii_whitespace() {
self.0.push(byte as char);
}
}
}
let mut stripped = Strip::with_capacity(bytes.len());
let mut parser = anstyle_parse::Parser::<anstyle_parse::DefaultCharAccumulator>::new();
for byte in bytes {
parser.advance(&mut stripped, *byte);
}
stripped.0
}
/// Model verifying incremental parsing
fn strip_char(mut s: &str) -> String {
let mut result = String::new();
let mut state = StripStr::new();
while !s.is_empty() {
let mut indices = s.char_indices();
indices.next(); // current
let offset = indices.next().map(|(i, _)| i).unwrap_or_else(|| s.len());
let (current, remainder) = s.split_at(offset);
for printable in state.strip_next(current) {
result.push_str(printable);
}
s = remainder;
}
result
}
/// Model verifying incremental parsing
fn strip_byte(s: &[u8]) -> Vec<u8> {
let mut result = Vec::new();
let mut state = StripBytes::default();
for start in 0..s.len() {
let current = &s[start..=start];
for printable in state.strip_next(current) {
result.extend(printable);
}
}
result
}
#[test]
fn test_strip_bytes_multibyte() {
let bytes = [240, 145, 141, 139];
let expected = parser_strip(&bytes);
let actual = String::from_utf8(strip_bytes(&bytes).into_vec()).unwrap();
assert_eq!(expected, actual);
}
#[test]
fn test_strip_byte_multibyte() {
let bytes = [240, 145, 141, 139];
let expected = parser_strip(&bytes);
let actual = String::from_utf8(strip_byte(&bytes).to_vec()).unwrap();
assert_eq!(expected, actual);
}
#[test]
fn test_strip_str_del() {
let input = std::str::from_utf8(&[0x7f]).unwrap();
let expected = "";
let actual = strip_str(input).to_string();
assert_eq!(expected, actual);
}
#[test]
fn test_strip_byte_del() {
let bytes = [0x7f];
let expected = "";
let actual = String::from_utf8(strip_byte(&bytes).to_vec()).unwrap();
assert_eq!(expected, actual);
}
#[test]
fn test_strip_str_handles_broken_sequence() {
// valid utf8: \xc3\xb6 then \x1b then \xf0\x9f\x98\x80
let s = "ö\x1b😀hello😀goodbye";
let mut it = strip_str(s);
assert_eq!("ö", it.next().unwrap());
assert_eq!("ello😀goodbye", it.next().unwrap());
}
proptest! {
#[test]
#[cfg_attr(miri, ignore)] // See https://github.com/AltSysrq/proptest/issues/253
fn strip_str_no_escapes(s in "\\PC*") {
let expected = parser_strip(s.as_bytes());
let actual = strip_str(&s).to_string();
assert_eq!(expected, actual);
}
#[test]
#[cfg_attr(miri, ignore)] // See https://github.com/AltSysrq/proptest/issues/253
fn strip_char_no_escapes(s in "\\PC*") {
let expected = parser_strip(s.as_bytes());
let actual = strip_char(&s);
assert_eq!(expected, actual);
}
#[test]
#[cfg_attr(miri, ignore)] // See https://github.com/AltSysrq/proptest/issues/253
fn strip_bytes_no_escapes(s in "\\PC*") {
dbg!(&s);
dbg!(s.as_bytes());
let expected = parser_strip(s.as_bytes());
let actual = String::from_utf8(strip_bytes(s.as_bytes()).into_vec()).unwrap();
assert_eq!(expected, actual);
}
#[test]
#[cfg_attr(miri, ignore)] // See https://github.com/AltSysrq/proptest/issues/253
fn strip_byte_no_escapes(s in "\\PC*") {
dbg!(&s);
dbg!(s.as_bytes());
let expected = parser_strip(s.as_bytes());
let actual = String::from_utf8(strip_byte(s.as_bytes()).to_vec()).unwrap();
assert_eq!(expected, actual);
}
}
}

383
vendor/anstream/src/adapter/wincon.rs vendored Normal file

@ -0,0 +1,383 @@
/// Incrementally convert to wincon calls for non-contiguous data
#[derive(Default, Clone, Debug, PartialEq, Eq)]
pub struct WinconBytes {
parser: anstyle_parse::Parser,
capture: WinconCapture,
}
impl WinconBytes {
/// Initial state
pub fn new() -> Self {
Default::default()
}
/// Strip the next segment of data
pub fn extract_next<'s>(&'s mut self, bytes: &'s [u8]) -> WinconBytesIter<'s> {
self.capture.reset();
self.capture.printable.reserve(bytes.len());
WinconBytesIter {
bytes,
parser: &mut self.parser,
capture: &mut self.capture,
}
}
}
/// See [`WinconBytes`]
#[derive(Debug, PartialEq, Eq)]
pub struct WinconBytesIter<'s> {
bytes: &'s [u8],
parser: &'s mut anstyle_parse::Parser,
capture: &'s mut WinconCapture,
}
impl<'s> Iterator for WinconBytesIter<'s> {
type Item = (anstyle::Style, String);
#[inline]
fn next(&mut self) -> Option<Self::Item> {
next_bytes(&mut self.bytes, self.parser, self.capture)
}
}
#[inline]
fn next_bytes(
bytes: &mut &[u8],
parser: &mut anstyle_parse::Parser,
capture: &mut WinconCapture,
) -> Option<(anstyle::Style, String)> {
capture.reset();
while capture.ready.is_none() {
let byte = if let Some((byte, remainder)) = (*bytes).split_first() {
*bytes = remainder;
*byte
} else {
break;
};
parser.advance(capture, byte);
}
if capture.printable.is_empty() {
return None;
}
let style = capture.ready.unwrap_or(capture.style);
Some((style, std::mem::take(&mut capture.printable)))
}
#[derive(Default, Clone, Debug, PartialEq, Eq)]
struct WinconCapture {
style: anstyle::Style,
printable: String,
ready: Option<anstyle::Style>,
}
impl WinconCapture {
fn reset(&mut self) {
self.ready = None;
}
}
impl anstyle_parse::Perform for WinconCapture {
/// Draw a character to the screen and update states.
fn print(&mut self, c: char) {
self.printable.push(c);
}
/// Execute a C0 or C1 control function.
fn execute(&mut self, byte: u8) {
if byte.is_ascii_whitespace() {
self.printable.push(byte as char);
}
}
fn csi_dispatch(
&mut self,
params: &anstyle_parse::Params,
_intermediates: &[u8],
ignore: bool,
action: u8,
) {
if ignore {
return;
}
if action != b'm' {
return;
}
let mut style = self.style;
// param/value differences are dependent on the escape code
let mut state = State::Normal;
let mut r = None;
let mut g = None;
let mut color_target = ColorTarget::Fg;
for param in params {
for value in param {
match (state, *value) {
(State::Normal, 0) => {
style = anstyle::Style::default();
break;
}
(State::Normal, 1) => {
style = style.bold();
break;
}
(State::Normal, 2) => {
style = style.dimmed();
break;
}
(State::Normal, 3) => {
style = style.italic();
break;
}
(State::Normal, 4) => {
style = style.underline();
state = State::Underline;
}
(State::Normal, 21) => {
style |= anstyle::Effects::DOUBLE_UNDERLINE;
break;
}
(State::Normal, 7) => {
style = style.invert();
break;
}
(State::Normal, 8) => {
style = style.hidden();
break;
}
(State::Normal, 9) => {
style = style.strikethrough();
break;
}
(State::Normal, 30..=37) => {
let color = to_ansi_color(value - 30).unwrap();
style = style.fg_color(Some(color.into()));
break;
}
(State::Normal, 38) => {
color_target = ColorTarget::Fg;
state = State::PrepareCustomColor;
}
(State::Normal, 39) => {
style = style.fg_color(None);
break;
}
(State::Normal, 40..=47) => {
let color = to_ansi_color(value - 40).unwrap();
style = style.bg_color(Some(color.into()));
break;
}
(State::Normal, 48) => {
color_target = ColorTarget::Bg;
state = State::PrepareCustomColor;
}
(State::Normal, 49) => {
style = style.bg_color(None);
break;
}
(State::Normal, 58) => {
color_target = ColorTarget::Underline;
state = State::PrepareCustomColor;
}
(State::Normal, 90..=97) => {
let color = to_ansi_color(value - 90).unwrap().bright(true);
style = style.fg_color(Some(color.into()));
break;
}
(State::Normal, 100..=107) => {
let color = to_ansi_color(value - 100).unwrap().bright(true);
style = style.bg_color(Some(color.into()));
break;
}
(State::PrepareCustomColor, 5) => {
state = State::Ansi256;
}
(State::PrepareCustomColor, 2) => {
state = State::Rgb;
r = None;
g = None;
}
(State::Ansi256, n) => {
let color = anstyle::Ansi256Color(n as u8);
style = match color_target {
ColorTarget::Fg => style.fg_color(Some(color.into())),
ColorTarget::Bg => style.bg_color(Some(color.into())),
ColorTarget::Underline => style.underline_color(Some(color.into())),
};
break;
}
(State::Rgb, b) => match (r, g) {
(None, _) => {
r = Some(b);
}
(Some(_), None) => {
g = Some(b);
}
(Some(r), Some(g)) => {
let color = anstyle::RgbColor(r as u8, g as u8, b as u8);
style = match color_target {
ColorTarget::Fg => style.fg_color(Some(color.into())),
ColorTarget::Bg => style.bg_color(Some(color.into())),
ColorTarget::Underline => style.underline_color(Some(color.into())),
};
break;
}
},
(State::Underline, 0) => {
style =
style.effects(style.get_effects().remove(anstyle::Effects::UNDERLINE));
}
(State::Underline, 1) => {
// underline already set
}
(State::Underline, 2) => {
style = style
.effects(style.get_effects().remove(anstyle::Effects::UNDERLINE))
| anstyle::Effects::DOUBLE_UNDERLINE;
}
(State::Underline, 3) => {
style = style
.effects(style.get_effects().remove(anstyle::Effects::UNDERLINE))
| anstyle::Effects::CURLY_UNDERLINE;
}
(State::Underline, 4) => {
style = style
.effects(style.get_effects().remove(anstyle::Effects::UNDERLINE))
| anstyle::Effects::DOTTED_UNDERLINE;
}
(State::Underline, 5) => {
style = style
.effects(style.get_effects().remove(anstyle::Effects::UNDERLINE))
| anstyle::Effects::DASHED_UNDERLINE;
}
_ => {
break;
}
}
}
}
if style != self.style && !self.printable.is_empty() {
self.ready = Some(self.style);
}
self.style = style;
}
}
#[derive(Copy, Clone, PartialEq, Eq, Debug)]
enum State {
Normal,
PrepareCustomColor,
Ansi256,
Rgb,
Underline,
}
#[derive(Copy, Clone, PartialEq, Eq, Debug)]
enum ColorTarget {
Fg,
Bg,
Underline,
}
fn to_ansi_color(digit: u16) -> Option<anstyle::AnsiColor> {
match digit {
0 => Some(anstyle::AnsiColor::Black),
1 => Some(anstyle::AnsiColor::Red),
2 => Some(anstyle::AnsiColor::Green),
3 => Some(anstyle::AnsiColor::Yellow),
4 => Some(anstyle::AnsiColor::Blue),
5 => Some(anstyle::AnsiColor::Magenta),
6 => Some(anstyle::AnsiColor::Cyan),
7 => Some(anstyle::AnsiColor::White),
_ => None,
}
}
#[cfg(test)]
mod test {
use super::*;
use owo_colors::OwoColorize as _;
use proptest::prelude::*;
#[track_caller]
fn verify(input: &str, expected: Vec<(anstyle::Style, &str)>) {
let expected = expected
.into_iter()
.map(|(style, value)| (style, value.to_owned()))
.collect::<Vec<_>>();
let mut state = WinconBytes::new();
let actual = state.extract_next(input.as_bytes()).collect::<Vec<_>>();
assert_eq!(expected, actual, "{input:?}");
}
#[test]
fn start() {
let input = format!("{} world!", "Hello".green().on_red());
let expected = vec![
(
anstyle::AnsiColor::Green.on(anstyle::AnsiColor::Red),
"Hello",
),
(anstyle::Style::default(), " world!"),
];
verify(&input, expected);
}
#[test]
fn middle() {
let input = format!("Hello {}!", "world".green().on_red());
let expected = vec![
(anstyle::Style::default(), "Hello "),
(
anstyle::AnsiColor::Green.on(anstyle::AnsiColor::Red),
"world",
),
(anstyle::Style::default(), "!"),
];
verify(&input, expected);
}
#[test]
fn end() {
let input = format!("Hello {}", "world!".green().on_red());
let expected = vec![
(anstyle::Style::default(), "Hello "),
(
anstyle::AnsiColor::Green.on(anstyle::AnsiColor::Red),
"world!",
),
];
verify(&input, expected);
}
#[test]
fn ansi256_colors() {
// termcolor only supports "brights" via these
let input = format!(
"Hello {}!",
"world".color(owo_colors::XtermColors::UserBrightYellow)
);
let expected = vec![
(anstyle::Style::default(), "Hello "),
(anstyle::Ansi256Color(11).on_default(), "world"),
(anstyle::Style::default(), "!"),
];
verify(&input, expected);
}
proptest! {
#[test]
#[cfg_attr(miri, ignore)] // See https://github.com/AltSysrq/proptest/issues/253
fn wincon_no_escapes(s in "\\PC*") {
let expected = if s.is_empty() {
vec![]
} else {
vec![(anstyle::Style::default(), s.clone())]
};
let mut state = WinconBytes::new();
let actual = state.extract_next(s.as_bytes()).collect::<Vec<_>>();
assert_eq!(expected, actual);
}
}
}

301
vendor/anstream/src/auto.rs vendored Normal file

@ -0,0 +1,301 @@
use crate::stream::AsLockedWrite;
use crate::stream::RawStream;
use crate::ColorChoice;
use crate::StripStream;
#[cfg(all(windows, feature = "wincon"))]
use crate::WinconStream;
/// [`std::io::Write`] that adapts ANSI escape codes to the underlying `Write`s capabilities
///
/// This includes
/// - Stripping colors for non-terminals
/// - Respecting env variables like [NO_COLOR](https://no-color.org/) or [CLICOLOR](https://bixense.com/clicolors/)
/// - *(windows)* Falling back to the wincon API where [ENABLE_VIRTUAL_TERMINAL_PROCESSING](https://learn.microsoft.com/en-us/windows/console/console-virtual-terminal-sequences#output-sequences) is unsupported
///
/// You can customize auto-detection by calling into
/// [anstyle_query](https://docs.rs/anstyle-query/latest/anstyle_query/)
/// to get a [`ColorChoice`] and then calling [`AutoStream::new(stream, choice)`].
#[derive(Debug)]
pub struct AutoStream<S: RawStream> {
inner: StreamInner<S>,
}
#[derive(Debug)]
enum StreamInner<S: RawStream> {
PassThrough(S),
Strip(StripStream<S>),
#[cfg(all(windows, feature = "wincon"))]
Wincon(WinconStream<S>),
}
impl<S> AutoStream<S>
where
S: RawStream,
{
/// Runtime control over styling behavior
///
/// # Example
///
/// ```rust
/// # #[cfg(feature = "auto")] {
/// # use std::io::IsTerminal as _;
/// // Like `AutoStream::choice` but without `NO_COLOR`, `CLICOLOR_FORCE`, `CI`
/// fn choice(raw: &dyn anstream::stream::RawStream) -> anstream::ColorChoice {
/// let choice = anstream::ColorChoice::global();
/// if choice == anstream::ColorChoice::Auto {
/// if raw.is_terminal() && anstyle_query::term_supports_color() {
/// anstream::ColorChoice::Always
/// } else {
/// anstream::ColorChoice::Never
/// }
/// } else {
/// choice
/// }
/// }
///
/// let stream = std::io::stdout();
/// let choice = choice(&stream);
/// let auto = anstream::AutoStream::new(stream, choice);
/// # }
/// ```
#[inline]
pub fn new(raw: S, choice: ColorChoice) -> Self {
match choice {
#[cfg(feature = "auto")]
ColorChoice::Auto => Self::auto(raw),
#[cfg(not(feature = "auto"))]
ColorChoice::Auto => Self::never(raw),
ColorChoice::AlwaysAnsi => Self::always_ansi(raw),
ColorChoice::Always => Self::always(raw),
ColorChoice::Never => Self::never(raw),
}
}
/// Auto-adapt for the stream's capabilities
#[cfg(feature = "auto")]
#[inline]
pub fn auto(raw: S) -> Self {
let choice = Self::choice(&raw);
debug_assert_ne!(choice, ColorChoice::Auto);
Self::new(raw, choice)
}
/// Report the desired choice for the given stream
#[cfg(feature = "auto")]
pub fn choice(raw: &S) -> ColorChoice {
choice(raw)
}
/// Force ANSI escape codes to be passed through as-is, no matter what the inner `Write`
/// supports.
#[inline]
pub fn always_ansi(raw: S) -> Self {
#[cfg(feature = "auto")]
{
if raw.is_terminal() {
let _ = anstyle_query::windows::enable_ansi_colors();
}
}
Self::always_ansi_(raw)
}
#[inline]
fn always_ansi_(raw: S) -> Self {
let inner = StreamInner::PassThrough(raw);
AutoStream { inner }
}
/// Force color, no matter what the inner `Write` supports.
#[inline]
pub fn always(raw: S) -> Self {
if cfg!(windows) {
#[cfg(feature = "auto")]
let use_wincon = raw.is_terminal()
&& !anstyle_query::windows::enable_ansi_colors().unwrap_or(true)
&& !anstyle_query::term_supports_ansi_color();
#[cfg(not(feature = "auto"))]
let use_wincon = true;
if use_wincon {
Self::wincon(raw).unwrap_or_else(|raw| Self::always_ansi_(raw))
} else {
Self::always_ansi_(raw)
}
} else {
Self::always_ansi(raw)
}
}
/// Only pass printable data to the inner `Write`.
#[inline]
pub fn never(raw: S) -> Self {
let inner = StreamInner::Strip(StripStream::new(raw));
AutoStream { inner }
}
#[inline]
fn wincon(raw: S) -> Result<Self, S> {
#[cfg(all(windows, feature = "wincon"))]
{
Ok(Self {
inner: StreamInner::Wincon(WinconStream::new(raw)),
})
}
#[cfg(not(all(windows, feature = "wincon")))]
{
Err(raw)
}
}
/// Get the wrapped [`RawStream`]
#[inline]
pub fn into_inner(self) -> S {
match self.inner {
StreamInner::PassThrough(w) => w,
StreamInner::Strip(w) => w.into_inner(),
#[cfg(all(windows, feature = "wincon"))]
StreamInner::Wincon(w) => w.into_inner(),
}
}
#[inline]
pub fn is_terminal(&self) -> bool {
match &self.inner {
StreamInner::PassThrough(w) => w.is_terminal(),
StreamInner::Strip(w) => w.is_terminal(),
#[cfg(all(windows, feature = "wincon"))]
StreamInner::Wincon(_) => true, // it's only ever a terminal
}
}
/// Prefer [`AutoStream::choice`]
///
/// This doesn't report what is requested but what is currently active.
#[inline]
#[cfg(feature = "auto")]
pub fn current_choice(&self) -> ColorChoice {
match &self.inner {
StreamInner::PassThrough(_) => ColorChoice::AlwaysAnsi,
StreamInner::Strip(_) => ColorChoice::Never,
#[cfg(all(windows, feature = "wincon"))]
StreamInner::Wincon(_) => ColorChoice::Always,
}
}
}
#[cfg(feature = "auto")]
fn choice(raw: &dyn RawStream) -> ColorChoice {
let choice = ColorChoice::global();
match choice {
ColorChoice::Auto => {
let clicolor = anstyle_query::clicolor();
let clicolor_enabled = clicolor.unwrap_or(false);
let clicolor_disabled = !clicolor.unwrap_or(true);
if anstyle_query::no_color() {
ColorChoice::Never
} else if anstyle_query::clicolor_force() {
ColorChoice::Always
} else if clicolor_disabled {
ColorChoice::Never
} else if raw.is_terminal()
&& (anstyle_query::term_supports_color()
|| clicolor_enabled
|| anstyle_query::is_ci())
{
ColorChoice::Always
} else {
ColorChoice::Never
}
}
ColorChoice::AlwaysAnsi | ColorChoice::Always | ColorChoice::Never => choice,
}
}
impl AutoStream<std::io::Stdout> {
/// Get exclusive access to the `AutoStream`
///
/// Why?
/// - Faster performance when writing in a loop
/// - Avoid other threads interleaving output with the current thread
#[inline]
pub fn lock(self) -> AutoStream<std::io::StdoutLock<'static>> {
let inner = match self.inner {
StreamInner::PassThrough(w) => StreamInner::PassThrough(w.lock()),
StreamInner::Strip(w) => StreamInner::Strip(w.lock()),
#[cfg(all(windows, feature = "wincon"))]
StreamInner::Wincon(w) => StreamInner::Wincon(w.lock()),
};
AutoStream { inner }
}
}
impl AutoStream<std::io::Stderr> {
/// Get exclusive access to the `AutoStream`
///
/// Why?
/// - Faster performance when writing in a loop
/// - Avoid other threads interleaving output with the current thread
#[inline]
pub fn lock(self) -> AutoStream<std::io::StderrLock<'static>> {
let inner = match self.inner {
StreamInner::PassThrough(w) => StreamInner::PassThrough(w.lock()),
StreamInner::Strip(w) => StreamInner::Strip(w.lock()),
#[cfg(all(windows, feature = "wincon"))]
StreamInner::Wincon(w) => StreamInner::Wincon(w.lock()),
};
AutoStream { inner }
}
}
impl<S> std::io::Write for AutoStream<S>
where
S: RawStream + AsLockedWrite,
{
// Must forward all calls to ensure locking happens appropriately
#[inline]
fn write(&mut self, buf: &[u8]) -> std::io::Result<usize> {
match &mut self.inner {
StreamInner::PassThrough(w) => w.as_locked_write().write(buf),
StreamInner::Strip(w) => w.write(buf),
#[cfg(all(windows, feature = "wincon"))]
StreamInner::Wincon(w) => w.write(buf),
}
}
#[inline]
fn write_vectored(&mut self, bufs: &[std::io::IoSlice<'_>]) -> std::io::Result<usize> {
match &mut self.inner {
StreamInner::PassThrough(w) => w.as_locked_write().write_vectored(bufs),
StreamInner::Strip(w) => w.write_vectored(bufs),
#[cfg(all(windows, feature = "wincon"))]
StreamInner::Wincon(w) => w.write_vectored(bufs),
}
}
// is_write_vectored: nightly only
#[inline]
fn flush(&mut self) -> std::io::Result<()> {
match &mut self.inner {
StreamInner::PassThrough(w) => w.as_locked_write().flush(),
StreamInner::Strip(w) => w.flush(),
#[cfg(all(windows, feature = "wincon"))]
StreamInner::Wincon(w) => w.flush(),
}
}
#[inline]
fn write_all(&mut self, buf: &[u8]) -> std::io::Result<()> {
match &mut self.inner {
StreamInner::PassThrough(w) => w.as_locked_write().write_all(buf),
StreamInner::Strip(w) => w.write_all(buf),
#[cfg(all(windows, feature = "wincon"))]
StreamInner::Wincon(w) => w.write_all(buf),
}
}
// write_all_vectored: nightly only
#[inline]
fn write_fmt(&mut self, args: std::fmt::Arguments<'_>) -> std::io::Result<()> {
match &mut self.inner {
StreamInner::PassThrough(w) => w.as_locked_write().write_fmt(args),
StreamInner::Strip(w) => w.write_fmt(args),
#[cfg(all(windows, feature = "wincon"))]
StreamInner::Wincon(w) => w.write_fmt(args),
}
}
}
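A usage sketch (an illustrative addition, not vendored code), assuming the crate's default `auto` feature is enabled:
use std::io::Write as _;

fn main() -> std::io::Result<()> {
    // Detect stdout's capabilities once and wrap it.
    let mut out = anstream::AutoStream::auto(std::io::stdout());
    // Depending on the detected ColorChoice, the escapes below are passed
    // through, stripped, or (on Windows) translated to wincon calls.
    writeln!(out, "\x1b[32mgreen\x1b[0m and plain")?;
    Ok(())
}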

68
vendor/anstream/src/buffer.rs vendored Normal file
View File

@ -0,0 +1,68 @@
#![allow(deprecated)]
/// In-memory [`RawStream`][crate::stream::RawStream]
#[derive(Clone, Default, Debug, PartialEq, Eq)]
#[deprecated(since = "0.6.2", note = "Use Vec")]
#[doc(hidden)]
pub struct Buffer(Vec<u8>);
impl Buffer {
#[inline]
pub fn new() -> Self {
Default::default()
}
#[inline]
pub fn with_capacity(capacity: usize) -> Self {
Self(Vec::with_capacity(capacity))
}
#[inline]
pub fn as_bytes(&self) -> &[u8] {
&self.0
}
}
impl AsRef<[u8]> for Buffer {
#[inline]
fn as_ref(&self) -> &[u8] {
self.as_bytes()
}
}
impl std::io::Write for Buffer {
#[inline]
fn write(&mut self, buf: &[u8]) -> std::io::Result<usize> {
self.0.extend(buf);
Ok(buf.len())
}
#[inline]
fn flush(&mut self) -> std::io::Result<()> {
Ok(())
}
}
#[cfg(all(windows, feature = "wincon"))]
impl anstyle_wincon::WinconStream for Buffer {
fn write_colored(
&mut self,
fg: Option<anstyle::AnsiColor>,
bg: Option<anstyle::AnsiColor>,
data: &[u8],
) -> std::io::Result<usize> {
self.0.write_colored(fg, bg, data)
}
}
#[cfg(all(windows, feature = "wincon"))]
impl anstyle_wincon::WinconStream for &'_ mut Buffer {
fn write_colored(
&mut self,
fg: Option<anstyle::AnsiColor>,
bg: Option<anstyle::AnsiColor>,
data: &[u8],
) -> std::io::Result<usize> {
(**self).write_colored(fg, bg, data)
}
}
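Since `Buffer` is deprecated in favor of `Vec`, an in-memory sink today can look like this illustrative sketch (`Vec<u8>` implements `RawStream`, see `stream.rs`):
use std::io::Write as _;

fn render_to_memory() -> Vec<u8> {
    // `never` wraps the Vec in a StripStream, so escapes are removed.
    let mut out = anstream::AutoStream::never(Vec::<u8>::new());
    let _ = write!(out, "\x1b[1mbold?\x1b[0m no, plain");
    out.into_inner()
}

fn main() {
    assert_eq!(render_to_memory().as_slice(), b"bold? no, plain");
}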

54
vendor/anstream/src/fmt.rs vendored Normal file
View File

@ -0,0 +1,54 @@
/// A shim which allows a [`std::io::Write`] to be implemented in terms of a [`std::fmt::Write`]
///
/// This saves off I/O errors instead of discarding them
pub(crate) struct Adapter<W>
where
W: FnMut(&[u8]) -> std::io::Result<()>,
{
writer: W,
error: std::io::Result<()>,
}
impl<W> Adapter<W>
where
W: FnMut(&[u8]) -> std::io::Result<()>,
{
pub(crate) fn new(writer: W) -> Self {
Adapter {
writer,
error: Ok(()),
}
}
pub(crate) fn write_fmt(mut self, fmt: std::fmt::Arguments<'_>) -> std::io::Result<()> {
match std::fmt::write(&mut self, fmt) {
Ok(()) => Ok(()),
Err(..) => {
// check if the error came from the underlying `Write` or not
if self.error.is_err() {
self.error
} else {
Err(std::io::Error::new(
std::io::ErrorKind::Other,
"formatter error",
))
}
}
}
}
}
impl<W> std::fmt::Write for Adapter<W>
where
W: FnMut(&[u8]) -> std::io::Result<()>,
{
fn write_str(&mut self, s: &str) -> std::fmt::Result {
match (self.writer)(s.as_bytes()) {
Ok(()) => Ok(()),
Err(e) => {
self.error = Err(e);
Err(std::fmt::Error)
}
}
}
}
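`Adapter` itself is crate-private, so as a hedged illustration of the same idea, here is a free-standing copy of the pattern plus a driver showing that the concrete `io::Error` survives the round trip through `fmt::Write`:
use std::fmt::Write as _;

struct Shim<W: FnMut(&[u8]) -> std::io::Result<()>> {
    writer: W,
    error: std::io::Result<()>,
}

impl<W: FnMut(&[u8]) -> std::io::Result<()>> std::fmt::Write for Shim<W> {
    fn write_str(&mut self, s: &str) -> std::fmt::Result {
        match (self.writer)(s.as_bytes()) {
            Ok(()) => Ok(()),
            Err(e) => {
                self.error = Err(e); // remember the real cause
                Err(std::fmt::Error)
            }
        }
    }
}

fn main() {
    // A writer that always refuses, standing in for a failing pipe.
    let failing = |_: &[u8]| -> std::io::Result<()> {
        Err(std::io::Error::new(std::io::ErrorKind::WouldBlock, "full"))
    };
    let mut shim = Shim { writer: failing, error: Ok(()) };
    assert!(write!(shim, "{}", 42).is_err());
    // The original io::ErrorKind is still recoverable, not just fmt::Error.
    assert_eq!(shim.error.unwrap_err().kind(), std::io::ErrorKind::WouldBlock);
}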

81
vendor/anstream/src/lib.rs vendored Normal file
View File

@ -0,0 +1,81 @@
//! **Auto-adapting [`stdout`] / [`stderr`] streams**
//!
//! *A portmanteau of "ansi stream"*
//!
//! [`AutoStream`] always accepts [ANSI escape codes](https://en.wikipedia.org/wiki/ANSI_escape_code),
//! [adapting to the user's terminal's capabilities][AutoStream].
//!
//! Benefits
//! - Allows the caller to not be concerned with the terminal's capabilities
//! - Semver safe way of passing styled text between crates as ANSI escape codes offer more
//! compatibility than most crate APIs.
//!
//! Available styling crates:
//! - [anstyle](https://docs.rs/anstyle) for minimal runtime styling, designed to go in public APIs
//! (once it hits 1.0)
//! - [owo-colors](https://docs.rs/owo-colors) for feature-rich runtime styling
//! - [color-print](https://docs.rs/color-print) for feature-rich compile-time styling
//!
//! # Example
//!
//! ```
//! # #[cfg(feature = "auto")] {
//! use anstream::println;
//! use owo_colors::OwoColorize as _;
//!
//! // Foreground colors
//! println!("My number is {:#x}!", 10.green());
//! // Background colors
//! println!("My number is not {}!", 4.on_red());
//! # }
//! ```
//!
//! And this will correctly handle piping to a file, etc.
#![cfg_attr(docsrs, feature(doc_auto_cfg))]
pub mod adapter;
pub mod stream;
mod buffer;
#[macro_use]
mod macros;
mod auto;
mod fmt;
mod strip;
#[cfg(all(windows, feature = "wincon"))]
mod wincon;
pub use auto::AutoStream;
pub use strip::StripStream;
#[cfg(all(windows, feature = "wincon"))]
pub use wincon::WinconStream;
#[allow(deprecated)]
pub use buffer::Buffer;
pub type Stdout = AutoStream<std::io::Stdout>;
pub type Stderr = AutoStream<std::io::Stderr>;
/// Create an ANSI escape code compatible stdout
///
/// **Note:** Call [`AutoStream::lock`] in loops to avoid the performance hit of acquiring/releasing
/// from the implicit locking in each [`std::io::Write`] call
#[cfg(feature = "auto")]
pub fn stdout() -> Stdout {
let stdout = std::io::stdout();
AutoStream::auto(stdout)
}
/// Create an ANSI escape code compatible stderr
///
/// **Note:** Call [`AutoStream::lock`] in loops to avoid the performance hit of acquiring/releasing
/// from the implicit locking in each [`std::io::Write`] call
#[cfg(feature = "auto")]
pub fn stderr() -> Stderr {
let stderr = std::io::stderr();
AutoStream::auto(stderr)
}
/// Selection for overriding color output
pub use colorchoice::ColorChoice;
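A usage sketch for the helpers above (illustrative addition, `auto` feature assumed): grab `stdout()` once, lock it across a loop, and let the stream decide what to do with the escapes.
use std::io::Write as _;

fn main() -> std::io::Result<()> {
    let mut out = anstream::stdout().lock();
    for i in 0..3 {
        writeln!(out, "\x1b[1mitem\x1b[0m {i}")?;
    }
    out.flush()
}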

389
vendor/anstream/src/macros.rs vendored Normal file
View File

@ -0,0 +1,389 @@
/// Prints to [`stdout`][crate::stdout].
///
/// Equivalent to the [`println!`] macro except that a newline is not printed at
/// the end of the message.
///
/// Note that stdout is frequently line-buffered by default so it may be
/// necessary to use [`std::io::Write::flush()`] to ensure the output is emitted
/// immediately.
///
/// **NOTE:** The `print!` macro will lock the standard output on each call. If you call
/// `print!` within a hot loop, this behavior may be the bottleneck of the loop.
/// To avoid this, lock stdout with [`AutoStream::lock`][crate::AutoStream::lock]:
/// ```
/// # #[cfg(feature = "auto")] {
/// use std::io::Write as _;
///
/// let mut lock = anstream::stdout().lock();
/// write!(lock, "hello world").unwrap();
/// # }
/// ```
///
/// Use `print!` only for the primary output of your program. Use
/// [`eprint!`] instead to print error and progress messages.
///
/// **NOTE:** Not all `print!` calls will be captured in tests like [`std::print!`]
/// - Capturing will automatically be activated in test binaries
/// - Otherwise, only when the `test` feature is enabled
///
/// # Panics
///
/// Panics if writing to `stdout` fails for any reason **except** broken pipe.
///
/// Writing to non-blocking stdout can cause an error, which will lead
/// this macro to panic.
///
/// # Examples
///
/// ```
/// # #[cfg(feature = "auto")] {
/// use std::io::Write as _;
/// use anstream::print;
/// use anstream::stdout;
///
/// print!("this ");
/// print!("will ");
/// print!("be ");
/// print!("on ");
/// print!("the ");
/// print!("same ");
/// print!("line ");
///
/// stdout().flush().unwrap();
///
/// print!("this string has a newline, why not choose println! instead?\n");
///
/// stdout().flush().unwrap();
/// # }
/// ```
#[cfg(feature = "auto")]
#[macro_export]
macro_rules! print {
($($arg:tt)*) => {{
if cfg!(any(feature = "test", test)) {
use std::io::Write as _;
let stdio = std::io::stdout();
let choice = $crate::AutoStream::choice(&stdio);
let buffer = Vec::new();
let mut stream = $crate::AutoStream::new(buffer, choice);
// Ignore errors rather than panic
let _ = ::std::write!(&mut stream, $($arg)*);
let buffer = stream.into_inner();
// Should be UTF-8 but not wanting to panic
let buffer = String::from_utf8_lossy(&buffer);
::std::print!("{}", buffer)
} else {
use std::io::Write as _;
let mut stream = $crate::stdout();
match ::std::write!(&mut stream, $($arg)*) {
Err(e) if e.kind() != ::std::io::ErrorKind::BrokenPipe => {
::std::panic!("failed printing to stdout: {e}");
}
Err(_) | Ok(_) => {}
}
}
}};
}
/// Prints to [`stdout`][crate::stdout], with a newline.
///
/// On all platforms, the newline is the LINE FEED character (`\n`/`U+000A`) alone
/// (no additional CARRIAGE RETURN (`\r`/`U+000D`)).
///
/// This macro uses the same syntax as [`format!`], but writes to the standard output instead.
/// See [`std::fmt`] for more information.
///
/// **NOTE:** The `println!` macro will lock the standard output on each call. If you call
/// `println!` within a hot loop, this behavior may be the bottleneck of the loop.
/// To avoid this, lock stdout with [`AutoStream::lock`][crate::AutoStream::lock]:
/// ```
/// # #[cfg(feature = "auto")] {
/// use std::io::Write as _;
///
/// let mut lock = anstream::stdout().lock();
/// writeln!(lock, "hello world").unwrap();
/// # }
/// ```
///
/// Use `println!` only for the primary output of your program. Use
/// [`eprintln!`] instead to print error and progress messages.
///
/// **NOTE:** Not all `println!` calls will be captured in tests like [`std::println!`]
/// - Capturing will automatically be activated in test binaries
/// - Otherwise, only when the `test` feature is enabled
///
/// # Panics
///
/// Panics if writing to `stdout` fails for any reason **except** broken pipe.
///
/// Writing to non-blocking stdout can cause an error, which will lead
/// this macro to panic.
///
/// # Examples
///
/// ```
/// # #[cfg(feature = "auto")] {
/// use anstream::println;
///
/// println!(); // prints just a newline
/// println!("hello there!");
/// println!("format {} arguments", "some");
/// let local_variable = "some";
/// println!("format {local_variable} arguments");
/// # }
/// ```
#[cfg(feature = "auto")]
#[macro_export]
macro_rules! println {
() => {
$crate::print!("\n")
};
($($arg:tt)*) => {{
if cfg!(any(feature = "test", test)) {
use std::io::Write as _;
let stdio = std::io::stdout();
let choice = $crate::AutoStream::choice(&stdio);
let buffer = Vec::new();
let mut stream = $crate::AutoStream::new(buffer, choice);
// Ignore errors rather than panic
let _ = ::std::write!(&mut stream, $($arg)*);
let buffer = stream.into_inner();
// Should be UTF-8 but not wanting to panic
let buffer = String::from_utf8_lossy(&buffer);
::std::println!("{}", buffer)
} else {
use std::io::Write as _;
let mut stream = $crate::stdout();
match ::std::writeln!(&mut stream, $($arg)*) {
Err(e) if e.kind() != ::std::io::ErrorKind::BrokenPipe => {
::std::panic!("failed printing to stdout: {e}");
}
Err(_) | Ok(_) => {}
}
}
}};
}
/// Prints to [`stderr`][crate::stderr].
///
/// Equivalent to the [`print!`] macro, except that output goes to
/// `stderr` instead of `stdout`. See [`print!`] for
/// example usage.
///
/// Use `eprint!` only for error and progress messages. Use `print!`
/// instead for the primary output of your program.
///
/// **NOTE:** Not all `eprint!` calls will be captured in tests like [`std::eprint!`]
/// - Capturing will automatically be activated in test binaries
/// - Otherwise, only when the `test` feature is enabled
///
/// # Panics
///
/// Panics if writing to `stderr` fails for any reason **except** broken pipe.
///
/// Writing to non-blocking stderr can cause an error, which will lead
/// this macro to panic.
///
/// # Examples
///
/// ```
/// # #[cfg(feature = "auto")] {
/// use anstream::eprint;
///
/// eprint!("Error: Could not complete task");
/// # }
/// ```
#[cfg(feature = "auto")]
#[macro_export]
macro_rules! eprint {
($($arg:tt)*) => {{
if cfg!(any(feature = "test", test)) {
use std::io::Write as _;
let stdio = std::io::stderr();
let choice = $crate::AutoStream::choice(&stdio);
let buffer = Vec::new();
let mut stream = $crate::AutoStream::new(buffer, choice);
// Ignore errors rather than panic
let _ = ::std::write!(&mut stream, $($arg)*);
let buffer = stream.into_inner();
// Should be UTF-8 but not wanting to panic
let buffer = String::from_utf8_lossy(&buffer);
::std::eprint!("{}", buffer)
} else {
use std::io::Write as _;
let mut stream = $crate::stderr();
match ::std::write!(&mut stream, $($arg)*) {
Err(e) if e.kind() != ::std::io::ErrorKind::BrokenPipe => {
::std::panic!("failed printing to stderr: {e}");
}
Err(_) | Ok(_) => {}
}
}
}};
}
/// Prints to [`stderr`][crate::stderr], with a newline.
///
/// Equivalent to the [`println!`] macro, except that output goes to
/// `stderr` instead of `stdout`. See [`println!`] for
/// example usage.
///
/// Use `eprintln!` only for error and progress messages. Use `println!`
/// instead for the primary output of your program.
///
/// **NOTE:** Not all `eprintln!` calls will be captured in tests like [`std::eprintln!`]
/// - Capturing will automatically be activated in test binaries
/// - Otherwise, only when the `test` feature is enabled
///
/// # Panics
///
/// Panics if writing to `stderr` fails for any reason **except** broken pipe.
///
/// Writing to non-blocking stderr can cause an error, which will lead
/// this macro to panic.
///
/// # Examples
///
/// ```
/// # #[cfg(feature = "auto")] {
/// use anstream::eprintln;
///
/// eprintln!("Error: Could not complete task");
/// # }
/// ```
#[cfg(feature = "auto")]
#[macro_export]
macro_rules! eprintln {
() => {
$crate::eprint!("\n")
};
($($arg:tt)*) => {{
if cfg!(any(feature = "test", test)) {
use std::io::Write as _;
let stdio = std::io::stderr();
let choice = $crate::AutoStream::choice(&stdio);
let buffer = Vec::new();
let mut stream = $crate::AutoStream::new(buffer, choice);
// Ignore errors rather than panic
let _ = ::std::write!(&mut stream, $($arg)*);
let buffer = stream.into_inner();
// Should be UTF-8 but not wanting to panic
let buffer = String::from_utf8_lossy(&buffer);
::std::eprintln!("{}", buffer)
} else {
use std::io::Write as _;
let mut stream = $crate::stderr();
match ::std::writeln!(&mut stream, $($arg)*) {
Err(e) if e.kind() != ::std::io::ErrorKind::BrokenPipe => {
::std::panic!("failed printing to stderr: {e}");
}
Err(_) | Ok(_) => {}
}
}
}};
}
/// Panics the current thread.
///
/// This allows a program to terminate immediately and provide feedback
/// to the caller of the program.
///
/// This macro is the perfect way to assert conditions in example code and in
/// tests. `panic!` is closely tied with the `unwrap` method of both
/// [`Option`][ounwrap] and [`Result`][runwrap] enums. Both implementations call
/// `panic!` when they are set to [`None`] or [`Err`] variants.
///
/// When using `panic!()` you can specify a string payload that is built using
/// the [`format!`] syntax. That payload is used when injecting the panic into
/// the calling Rust thread, causing the thread to panic entirely.
///
/// The behavior of the default `std` hook, i.e. the code that runs directly
/// after the panic is invoked, is to print the message payload to
/// `stderr` along with the file/line/column information of the `panic!()`
/// call. You can override the panic hook using [`std::panic::set_hook()`].
/// Inside the hook a panic can be accessed as a `&dyn Any + Send`,
/// which contains either a `&str` or `String` for regular `panic!()` invocations.
/// To panic with a value of another type, [`panic_any`] can be used.
///
/// See also the macro [`compile_error!`], for raising errors during compilation.
///
/// # When to use `panic!` vs `Result`
///
/// The Rust language provides two complementary systems for constructing /
/// representing, reporting, propagating, reacting to, and discarding errors. These
/// responsibilities are collectively known as "error handling." `panic!` and
/// `Result` are similar in that they are each the primary interface of their
/// respective error handling systems; however, the meaning these interfaces attach
/// to their errors and the responsibilities they fulfill within their respective
/// error handling systems differ.
///
/// The `panic!` macro is used to construct errors that represent a bug that has
/// been detected in your program. With `panic!` you provide a message that
/// describes the bug and the language then constructs an error with that message,
/// reports it, and propagates it for you.
///
/// `Result` on the other hand is used to wrap other types that represent either
/// the successful result of some computation, `Ok(T)`, or error types that
/// represent an anticipated runtime failure mode of that computation, `Err(E)`.
/// `Result` is used alongside user defined types which represent the various
/// anticipated runtime failure modes that the associated computation could
/// encounter. `Result` must be propagated manually, often with the the help of the
/// `?` operator and `Try` trait, and they must be reported manually, often with
/// the help of the `Error` trait.
///
/// For more detailed information about error handling check out the [book] or the
/// [`std::result`] module docs.
///
/// [ounwrap]: Option::unwrap
/// [runwrap]: Result::unwrap
/// [`std::panic::set_hook()`]: ../std/panic/fn.set_hook.html
/// [`panic_any`]: ../std/panic/fn.panic_any.html
/// [`Box`]: ../std/boxed/struct.Box.html
/// [`Any`]: crate::any::Any
/// [`format!`]: ../std/macro.format.html
/// [book]: ../book/ch09-00-error-handling.html
/// [`std::result`]: ../std/result/index.html
///
/// # Current implementation
///
/// If the main thread panics it will terminate all your threads and end your
/// program with code `101`.
///
/// # Examples
///
/// ```should_panic
/// # #![allow(unreachable_code)]
/// use anstream::panic;
/// panic!();
/// panic!("this is a terrible mistake!");
/// panic!("this is a {} {message}", "fancy", message = "message");
/// ```
#[cfg(feature = "auto")]
#[macro_export]
macro_rules! panic {
() => {
::std::panic!()
};
($($arg:tt)*) => {{
use std::io::Write as _;
let panic_stream = std::io::stderr();
let choice = $crate::AutoStream::choice(&panic_stream);
let buffer = Vec::new();
let mut stream = $crate::AutoStream::new(buffer, choice);
// Ignore errors rather than panic
let _ = ::std::write!(&mut stream, $($arg)*);
let buffer = stream.into_inner();
// Should be UTF-8 but not wanting to panic
let buffer = String::from_utf8_lossy(&buffer).into_owned();
::std::panic!("{}", buffer)
}};
}
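A sketch of the macros above used as drop-in replacements for the std ones (illustrative addition; `auto` feature assumed, with escape codes written literally rather than produced by a styling crate):
use anstream::{eprintln, println};

fn main() {
    println!("\x1b[32mok\x1b[0m: build finished");
    eprintln!("\x1b[33mwarning\x1b[0m: 2 lints emitted");
}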

261
vendor/anstream/src/stream.rs vendored Normal file
View File

@ -0,0 +1,261 @@
//! Higher-level traits to describe writeable streams
/// Required functionality for underlying [`std::io::Write`] for adaptation
#[cfg(not(all(windows, feature = "wincon")))]
pub trait RawStream: std::io::Write + IsTerminal + private::Sealed {}
/// Required functionality for underlying [`std::io::Write`] for adaptation
#[cfg(all(windows, feature = "wincon"))]
pub trait RawStream:
std::io::Write + IsTerminal + anstyle_wincon::WinconStream + private::Sealed
{
}
impl RawStream for std::io::Stdout {}
impl RawStream for std::io::StdoutLock<'_> {}
impl RawStream for &'_ mut std::io::StdoutLock<'_> {}
impl RawStream for std::io::Stderr {}
impl RawStream for std::io::StderrLock<'_> {}
impl RawStream for &'_ mut std::io::StderrLock<'_> {}
impl RawStream for Box<dyn std::io::Write> {}
impl RawStream for &'_ mut Box<dyn std::io::Write> {}
impl RawStream for Vec<u8> {}
impl RawStream for &'_ mut Vec<u8> {}
impl RawStream for std::fs::File {}
impl RawStream for &'_ mut std::fs::File {}
#[allow(deprecated)]
impl RawStream for crate::Buffer {}
#[allow(deprecated)]
impl RawStream for &'_ mut crate::Buffer {}
pub trait IsTerminal: private::Sealed {
fn is_terminal(&self) -> bool;
}
impl IsTerminal for std::io::Stdout {
#[inline]
fn is_terminal(&self) -> bool {
std::io::IsTerminal::is_terminal(self)
}
}
impl IsTerminal for std::io::StdoutLock<'_> {
#[inline]
fn is_terminal(&self) -> bool {
std::io::IsTerminal::is_terminal(self)
}
}
impl IsTerminal for &'_ mut std::io::StdoutLock<'_> {
#[inline]
fn is_terminal(&self) -> bool {
(**self).is_terminal()
}
}
impl IsTerminal for std::io::Stderr {
#[inline]
fn is_terminal(&self) -> bool {
std::io::IsTerminal::is_terminal(self)
}
}
impl IsTerminal for std::io::StderrLock<'_> {
#[inline]
fn is_terminal(&self) -> bool {
std::io::IsTerminal::is_terminal(self)
}
}
impl IsTerminal for &'_ mut std::io::StderrLock<'_> {
#[inline]
fn is_terminal(&self) -> bool {
(**self).is_terminal()
}
}
impl IsTerminal for Box<dyn std::io::Write> {
#[inline]
fn is_terminal(&self) -> bool {
false
}
}
impl IsTerminal for &'_ mut Box<dyn std::io::Write> {
#[inline]
fn is_terminal(&self) -> bool {
false
}
}
impl IsTerminal for Vec<u8> {
#[inline]
fn is_terminal(&self) -> bool {
false
}
}
impl IsTerminal for &'_ mut Vec<u8> {
#[inline]
fn is_terminal(&self) -> bool {
false
}
}
impl IsTerminal for std::fs::File {
#[inline]
fn is_terminal(&self) -> bool {
std::io::IsTerminal::is_terminal(self)
}
}
impl IsTerminal for &'_ mut std::fs::File {
#[inline]
fn is_terminal(&self) -> bool {
(**self).is_terminal()
}
}
#[allow(deprecated)]
impl IsTerminal for crate::Buffer {
#[inline]
fn is_terminal(&self) -> bool {
false
}
}
#[allow(deprecated)]
impl IsTerminal for &'_ mut crate::Buffer {
#[inline]
fn is_terminal(&self) -> bool {
(**self).is_terminal()
}
}
pub trait AsLockedWrite: private::Sealed {
type Write<'w>: RawStream + 'w
where
Self: 'w;
fn as_locked_write(&mut self) -> Self::Write<'_>;
}
impl AsLockedWrite for std::io::Stdout {
type Write<'w> = std::io::StdoutLock<'w>;
#[inline]
fn as_locked_write(&mut self) -> Self::Write<'_> {
self.lock()
}
}
impl AsLockedWrite for std::io::StdoutLock<'static> {
type Write<'w> = &'w mut Self;
#[inline]
fn as_locked_write(&mut self) -> Self::Write<'_> {
self
}
}
impl AsLockedWrite for std::io::Stderr {
type Write<'w> = std::io::StderrLock<'w>;
#[inline]
fn as_locked_write(&mut self) -> Self::Write<'_> {
self.lock()
}
}
impl AsLockedWrite for std::io::StderrLock<'static> {
type Write<'w> = &'w mut Self;
#[inline]
fn as_locked_write(&mut self) -> Self::Write<'_> {
self
}
}
impl AsLockedWrite for Box<dyn std::io::Write> {
type Write<'w> = &'w mut Self;
#[inline]
fn as_locked_write(&mut self) -> Self::Write<'_> {
self
}
}
impl AsLockedWrite for Vec<u8> {
type Write<'w> = &'w mut Self;
#[inline]
fn as_locked_write(&mut self) -> Self::Write<'_> {
self
}
}
impl AsLockedWrite for std::fs::File {
type Write<'w> = &'w mut Self;
#[inline]
fn as_locked_write(&mut self) -> Self::Write<'_> {
self
}
}
#[allow(deprecated)]
impl AsLockedWrite for crate::Buffer {
type Write<'w> = &'w mut Self;
#[inline]
fn as_locked_write(&mut self) -> Self::Write<'_> {
self
}
}
mod private {
pub trait Sealed {}
impl Sealed for std::io::Stdout {}
impl Sealed for std::io::StdoutLock<'_> {}
impl Sealed for &'_ mut std::io::StdoutLock<'_> {}
impl Sealed for std::io::Stderr {}
impl Sealed for std::io::StderrLock<'_> {}
impl Sealed for &'_ mut std::io::StderrLock<'_> {}
impl Sealed for Box<dyn std::io::Write> {}
impl Sealed for &'_ mut Box<dyn std::io::Write> {}
impl Sealed for Vec<u8> {}
impl Sealed for &'_ mut Vec<u8> {}
impl Sealed for std::fs::File {}
impl Sealed for &'_ mut std::fs::File {}
#[allow(deprecated)]
impl Sealed for crate::Buffer {}
#[allow(deprecated)]
impl Sealed for &'_ mut crate::Buffer {}
}
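To show how these traits compose (an addition, not vendored code), a generic helper bounded like `AutoStream`'s `Write` impl: it reports terminal status and writes through the locked form.
use anstream::stream::{AsLockedWrite, IsTerminal, RawStream};
use std::io::Write as _;

fn emit<S: RawStream + AsLockedWrite>(mut sink: S, msg: &str) -> std::io::Result<bool> {
    let tty = sink.is_terminal();
    sink.as_locked_write().write_all(msg.as_bytes())?;
    Ok(tty)
}

fn main() -> std::io::Result<()> {
    emit(std::io::stdout(), "to the terminal, maybe\n")?;
    // Plain byte buffers satisfy the same bounds and are never terminals.
    let tty = emit(Vec::<u8>::new(), "to memory\n")?;
    assert!(!tty);
    Ok(())
}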

226
vendor/anstream/src/strip.rs vendored Normal file
View File

@ -0,0 +1,226 @@
use crate::adapter::StripBytes;
use crate::stream::AsLockedWrite;
use crate::stream::IsTerminal;
/// Only pass printable data to the inner `Write`
#[derive(Debug)]
pub struct StripStream<S>
where
S: std::io::Write,
{
raw: S,
state: StripBytes,
}
impl<S> StripStream<S>
where
S: std::io::Write,
{
/// Only pass printable data to the inner `Write`
#[inline]
pub fn new(raw: S) -> Self {
Self {
raw,
state: Default::default(),
}
}
/// Get the wrapped [`std::io::Write`]
#[inline]
pub fn into_inner(self) -> S {
self.raw
}
}
impl<S> StripStream<S>
where
S: std::io::Write,
S: IsTerminal,
{
#[inline]
pub fn is_terminal(&self) -> bool {
self.raw.is_terminal()
}
}
impl StripStream<std::io::Stdout> {
/// Get exclusive access to the `StripStream`
///
/// Why?
/// - Faster performance when writing in a loop
/// - Avoid other threads interleaving output with the current thread
#[inline]
pub fn lock(self) -> StripStream<std::io::StdoutLock<'static>> {
StripStream {
raw: self.raw.lock(),
state: self.state,
}
}
}
impl StripStream<std::io::Stderr> {
/// Get exclusive access to the `StripStream`
///
/// Why?
/// - Faster performance when writing in a loop
/// - Avoid other threads interleaving output with the current thread
#[inline]
pub fn lock(self) -> StripStream<std::io::StderrLock<'static>> {
StripStream {
raw: self.raw.lock(),
state: self.state,
}
}
}
impl<S> std::io::Write for StripStream<S>
where
S: std::io::Write,
S: AsLockedWrite,
{
// Must forward all calls to ensure locking happens appropriately
#[inline]
fn write(&mut self, buf: &[u8]) -> std::io::Result<usize> {
write(&mut self.raw.as_locked_write(), &mut self.state, buf)
}
#[inline]
fn write_vectored(&mut self, bufs: &[std::io::IoSlice<'_>]) -> std::io::Result<usize> {
let buf = bufs
.iter()
.find(|b| !b.is_empty())
.map(|b| &**b)
.unwrap_or(&[][..]);
self.write(buf)
}
// is_write_vectored: nightly only
#[inline]
fn flush(&mut self) -> std::io::Result<()> {
self.raw.as_locked_write().flush()
}
#[inline]
fn write_all(&mut self, buf: &[u8]) -> std::io::Result<()> {
write_all(&mut self.raw.as_locked_write(), &mut self.state, buf)
}
// write_all_vectored: nightly only
#[inline]
fn write_fmt(&mut self, args: std::fmt::Arguments<'_>) -> std::io::Result<()> {
write_fmt(&mut self.raw.as_locked_write(), &mut self.state, args)
}
}
fn write(
raw: &mut dyn std::io::Write,
state: &mut StripBytes,
buf: &[u8],
) -> std::io::Result<usize> {
let initial_state = state.clone();
for printable in state.strip_next(buf) {
let possible = printable.len();
let written = raw.write(printable)?;
if possible != written {
let divergence = &printable[written..];
let offset = offset_to(buf, divergence);
let consumed = &buf[offset..];
*state = initial_state;
state.strip_next(consumed).last();
return Ok(offset);
}
}
Ok(buf.len())
}
fn write_all(
raw: &mut dyn std::io::Write,
state: &mut StripBytes,
buf: &[u8],
) -> std::io::Result<()> {
for printable in state.strip_next(buf) {
raw.write_all(printable)?;
}
Ok(())
}
fn write_fmt(
raw: &mut dyn std::io::Write,
state: &mut StripBytes,
args: std::fmt::Arguments<'_>,
) -> std::io::Result<()> {
let write_all = |buf: &[u8]| write_all(raw, state, buf);
crate::fmt::Adapter::new(write_all).write_fmt(args)
}
#[inline]
fn offset_to(total: &[u8], subslice: &[u8]) -> usize {
let total = total.as_ptr();
let subslice = subslice.as_ptr();
debug_assert!(
total <= subslice,
"`Offset::offset_to` only accepts slices of `self`"
);
subslice as usize - total as usize
}
#[cfg(test)]
mod test {
use super::*;
use proptest::prelude::*;
use std::io::Write as _;
proptest! {
#[test]
#[cfg_attr(miri, ignore)] // See https://github.com/AltSysrq/proptest/issues/253
fn write_all_no_escapes(s in "\\PC*") {
let buffer = Vec::new();
let mut stream = StripStream::new(buffer);
stream.write_all(s.as_bytes()).unwrap();
let buffer = stream.into_inner();
let actual = std::str::from_utf8(buffer.as_ref()).unwrap();
assert_eq!(s, actual);
}
#[test]
#[cfg_attr(miri, ignore)] // See https://github.com/AltSysrq/proptest/issues/253
fn write_byte_no_escapes(s in "\\PC*") {
let buffer = Vec::new();
let mut stream = StripStream::new(buffer);
for byte in s.as_bytes() {
stream.write_all(&[*byte]).unwrap();
}
let buffer = stream.into_inner();
let actual = std::str::from_utf8(buffer.as_ref()).unwrap();
assert_eq!(s, actual);
}
#[test]
#[cfg_attr(miri, ignore)] // See https://github.com/AltSysrq/proptest/issues/253
fn write_all_random(s in any::<Vec<u8>>()) {
let buffer = Vec::new();
let mut stream = StripStream::new(buffer);
stream.write_all(s.as_slice()).unwrap();
let buffer = stream.into_inner();
if let Ok(actual) = std::str::from_utf8(buffer.as_ref()) {
for char in actual.chars() {
assert!(!char.is_ascii() || !char.is_control() || char.is_ascii_whitespace(), "{:?} -> {:?}: {:?}", String::from_utf8_lossy(&s), actual, char);
}
}
}
#[test]
#[cfg_attr(miri, ignore)] // See https://github.com/AltSysrq/proptest/issues/253
fn write_byte_random(s in any::<Vec<u8>>()) {
let buffer = Vec::new();
let mut stream = StripStream::new(buffer);
for byte in s.as_slice() {
stream.write_all(&[*byte]).unwrap();
}
let buffer = stream.into_inner();
if let Ok(actual) = std::str::from_utf8(buffer.as_ref()) {
for char in actual.chars() {
assert!(!char.is_ascii() || !char.is_control() || char.is_ascii_whitespace(), "{:?} -> {:?}: {:?}", String::from_utf8_lossy(&s), actual, char);
}
}
}
}
}
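A usage sketch for `StripStream` (illustrative addition, mirroring the test pattern above): strip the escapes out of a styled byte string and keep only the printable text.
use std::io::Write as _;

fn main() {
    let mut stream = anstream::StripStream::new(Vec::new());
    stream
        .write_all(b"\x1b[1;32mHello\x1b[0m world")
        .expect("writing to a Vec cannot fail");
    let plain = stream.into_inner();
    // Only the printable text survives; the SGR sequences are gone.
    assert_eq!(std::str::from_utf8(plain.as_ref()).unwrap(), "Hello world");
}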

225
vendor/anstream/src/wincon.rs vendored Normal file
View File

@ -0,0 +1,225 @@
use crate::adapter::WinconBytes;
use crate::stream::AsLockedWrite;
use crate::stream::IsTerminal;
/// Only pass printable data to the inner `Write`
#[cfg(feature = "wincon")] // here mostly for documentation purposes
#[derive(Debug)]
pub struct WinconStream<S>
where
S: anstyle_wincon::WinconStream,
{
raw: S,
// `WinconBytes` is especially large compared to the other variants of `AutoStream`, so it is
// boxed here so that `AutoStream` doesn't have to discard one allocation and create another
// when calling `AutoStream::lock`
state: Box<WinconBytes>,
}
impl<S> WinconStream<S>
where
S: anstyle_wincon::WinconStream,
{
/// Only pass printable data to the inner `Write`
#[inline]
pub fn new(raw: S) -> Self {
Self {
raw,
state: Default::default(),
}
}
/// Get the wrapped [`anstyle_wincon::WinconStream`]
#[inline]
pub fn into_inner(self) -> S {
self.raw
}
}
impl<S> WinconStream<S>
where
S: anstyle_wincon::WinconStream,
S: IsTerminal,
{
#[inline]
pub fn is_terminal(&self) -> bool {
self.raw.is_terminal()
}
}
impl WinconStream<std::io::Stdout> {
/// Get exclusive access to the `WinconStream`
///
/// Why?
/// - Faster performance when writing in a loop
/// - Avoid other threads interleaving output with the current thread
#[inline]
pub fn lock(self) -> WinconStream<std::io::StdoutLock<'static>> {
WinconStream {
raw: self.raw.lock(),
state: self.state,
}
}
}
impl WinconStream<std::io::Stderr> {
/// Get exclusive access to the `WinconStream`
///
/// Why?
/// - Faster performance when writing in a loop
/// - Avoid other threads interleaving output with the current thread
#[inline]
pub fn lock(self) -> WinconStream<std::io::StderrLock<'static>> {
WinconStream {
raw: self.raw.lock(),
state: self.state,
}
}
}
impl<S> std::io::Write for WinconStream<S>
where
S: anstyle_wincon::WinconStream,
S: AsLockedWrite,
{
// Must forward all calls to ensure locking happens appropriately
#[inline]
fn write(&mut self, buf: &[u8]) -> std::io::Result<usize> {
write(&mut self.raw.as_locked_write(), &mut self.state, buf)
}
#[inline]
fn write_vectored(&mut self, bufs: &[std::io::IoSlice<'_>]) -> std::io::Result<usize> {
let buf = bufs
.iter()
.find(|b| !b.is_empty())
.map(|b| &**b)
.unwrap_or(&[][..]);
self.write(buf)
}
// is_write_vectored: nightly only
#[inline]
fn flush(&mut self) -> std::io::Result<()> {
self.raw.as_locked_write().flush()
}
#[inline]
fn write_all(&mut self, buf: &[u8]) -> std::io::Result<()> {
write_all(&mut self.raw.as_locked_write(), &mut self.state, buf)
}
// write_all_vectored: nightly only
#[inline]
fn write_fmt(&mut self, args: std::fmt::Arguments<'_>) -> std::io::Result<()> {
write_fmt(&mut self.raw.as_locked_write(), &mut self.state, args)
}
}
fn write(
raw: &mut dyn anstyle_wincon::WinconStream,
state: &mut WinconBytes,
buf: &[u8],
) -> std::io::Result<usize> {
for (style, printable) in state.extract_next(buf) {
let fg = style.get_fg_color().and_then(cap_wincon_color);
let bg = style.get_bg_color().and_then(cap_wincon_color);
let written = raw.write_colored(fg, bg, printable.as_bytes())?;
let possible = printable.len();
if possible != written {
// HACK: Unsupported atm
break;
}
}
Ok(buf.len())
}
fn write_all(
raw: &mut dyn anstyle_wincon::WinconStream,
state: &mut WinconBytes,
buf: &[u8],
) -> std::io::Result<()> {
for (style, printable) in state.extract_next(buf) {
let mut buf = printable.as_bytes();
let fg = style.get_fg_color().and_then(cap_wincon_color);
let bg = style.get_bg_color().and_then(cap_wincon_color);
while !buf.is_empty() {
match raw.write_colored(fg, bg, buf) {
Ok(0) => {
return Err(std::io::Error::new(
std::io::ErrorKind::WriteZero,
"failed to write whole buffer",
));
}
Ok(n) => buf = &buf[n..],
Err(ref e) if e.kind() == std::io::ErrorKind::Interrupted => {}
Err(e) => return Err(e),
}
}
}
Ok(())
}
fn write_fmt(
raw: &mut dyn anstyle_wincon::WinconStream,
state: &mut WinconBytes,
args: std::fmt::Arguments<'_>,
) -> std::io::Result<()> {
let write_all = |buf: &[u8]| write_all(raw, state, buf);
crate::fmt::Adapter::new(write_all).write_fmt(args)
}
fn cap_wincon_color(color: anstyle::Color) -> Option<anstyle::AnsiColor> {
match color {
anstyle::Color::Ansi(c) => Some(c),
anstyle::Color::Ansi256(c) => c.into_ansi(),
anstyle::Color::Rgb(_) => None,
}
}
#[cfg(test)]
mod test {
use super::*;
use proptest::prelude::*;
use std::io::Write as _;
proptest! {
#[test]
#[cfg_attr(miri, ignore)] // See https://github.com/AltSysrq/proptest/issues/253
fn write_all_no_escapes(s in "\\PC*") {
let buffer = Vec::new();
let mut stream = WinconStream::new(buffer);
stream.write_all(s.as_bytes()).unwrap();
let buffer = stream.into_inner();
let actual = std::str::from_utf8(buffer.as_ref()).unwrap();
assert_eq!(s, actual);
}
#[test]
#[cfg_attr(miri, ignore)] // See https://github.com/AltSysrq/proptest/issues/253
fn write_byte_no_escapes(s in "\\PC*") {
let buffer = Vec::new();
let mut stream = WinconStream::new(buffer);
for byte in s.as_bytes() {
stream.write_all(&[*byte]).unwrap();
}
let buffer = stream.into_inner();
let actual = std::str::from_utf8(buffer.as_ref()).unwrap();
assert_eq!(s, actual);
}
#[test]
#[cfg_attr(miri, ignore)] // See https://github.com/AltSysrq/proptest/issues/253
fn write_all_random(s in any::<Vec<u8>>()) {
let buffer = Vec::new();
let mut stream = WinconStream::new(buffer);
stream.write_all(s.as_slice()).unwrap();
}
#[test]
#[cfg_attr(miri, ignore)] // See https://github.com/AltSysrq/proptest/issues/253
fn write_byte_random(s in any::<Vec<u8>>()) {
let buffer = Vec::new();
let mut stream = WinconStream::new(buffer);
for byte in s.as_slice() {
stream.write_all(&[*byte]).unwrap();
}
}
}
}
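A hedged sketch of the wincon-backed path (illustrative addition): it only exists on Windows with the `wincon` feature, so the snippet is gated the way the module above is; in a downstream crate the `feature` check would refer to that crate's own feature names.
#[cfg(all(windows, feature = "wincon"))]
#[allow(dead_code)]
fn legacy_console_demo() -> std::io::Result<()> {
    use std::io::Write as _;
    // Per write() above, each styled span is forwarded with capped wincon
    // colors; RGB colors are dropped (cap_wincon_color returns None for them).
    let mut out = anstream::WinconStream::new(std::io::stdout());
    writeln!(out, "\x1b[31mred on the legacy console\x1b[0m")
}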

1
vendor/anstyle-parse/.cargo-checksum.json vendored Normal file
View File

@ -0,0 +1 @@
{"files":{"Cargo.lock":"7f68b5328c460caf1d2198b10fe1761e5f0282262f92d04076b30b25539970b0","Cargo.toml":"2834f39b7169c03b03da1e209f56133783ce00ea64d5f2c14381d93984ca20bf","LICENSE-APACHE":"b40930bbcf80744c86c46a12bc9da056641d722716c378f5659b9e555ef833e1","LICENSE-MIT":"c1d4bc00896473e0109ccb4c3c7d21addb55a4ff1a644be204dcfce26612af2a","README.md":"abc82171d436ee0eb221838e8d21a21a2e392504e87f0c130b5eca6a35671e1e","benches/parse.rs":"336c808d51c90db2497fa87e571df7f71c844a1b09be88839fe4255066c632f4","examples/parselog.rs":"58b7db739deed701aa0ab386d0d0c1772511b8aed1c08d31ec5b35a1c8cd4321","src/lib.rs":"c89f2afa0e982276dc47ca8d8a76d47516aa39aa9d3354254c87fdbf2f8ef4cc","src/params.rs":"8cfef4e2ab1961ca2d9f210da553fc6ac64bb6dbd03321f0ee7d6089ab45389c","src/state/codegen.rs":"8530124c8f998f391e47950f130590376321dcade810990f4312c3b1c0a61968","src/state/definitions.rs":"dc3dbb3244def74430a72b0108f019e22cc02e0ae5f563ee14d38300ff82b814","src/state/mod.rs":"be07c2ea393a971dd54117dc2ce8a3ffb5b803cb557ab468389b74570855fa37","src/state/table.rs":"673b7e9242c5248efc076086cc6923578ec2f059c0c26da21363528e20e4285c"},"package":"c75ac65da39e5fe5ab759307499ddad880d724eed2f6ce5b5e8a26f4f387928c"}

1202
vendor/anstyle-parse/Cargo.lock generated vendored Normal file

File diff suppressed because it is too large Load Diff

108
vendor/anstyle-parse/Cargo.toml vendored Normal file
View File

@ -0,0 +1,108 @@
# THIS FILE IS AUTOMATICALLY GENERATED BY CARGO
#
# When uploading crates to the registry Cargo will automatically
# "normalize" Cargo.toml files for maximal compatibility
# with all versions of Cargo and also rewrite `path` dependencies
# to registry (e.g., crates.io) dependencies.
#
# If you are reading this file be aware that the original Cargo.toml
# will likely look very different (and much more reasonable).
# See Cargo.toml.orig for the original contents.
[package]
edition = "2021"
rust-version = "1.70.0"
name = "anstyle-parse"
version = "0.2.3"
include = [
"build.rs",
"src/**/*",
"Cargo.toml",
"Cargo.lock",
"LICENSE*",
"README.md",
"benches/**/*",
"examples/**/*",
]
description = "Parse ANSI Style Escapes"
homepage = "https://github.com/rust-cli/anstyle"
readme = "README.md"
keywords = [
"ansi",
"terminal",
"color",
"vte",
]
categories = ["command-line-interface"]
license = "MIT OR Apache-2.0"
repository = "https://github.com/rust-cli/anstyle.git"
[[package.metadata.release.pre-release-replacements]]
file = "CHANGELOG.md"
min = 1
replace = "{{version}}"
search = "Unreleased"
[[package.metadata.release.pre-release-replacements]]
exactly = 1
file = "CHANGELOG.md"
replace = "...{{tag_name}}"
search = '\.\.\.HEAD'
[[package.metadata.release.pre-release-replacements]]
file = "CHANGELOG.md"
min = 1
replace = "{{date}}"
search = "ReleaseDate"
[[package.metadata.release.pre-release-replacements]]
exactly = 1
file = "CHANGELOG.md"
replace = """
<!-- next-header -->
## [Unreleased] - ReleaseDate
"""
search = "<!-- next-header -->"
[[package.metadata.release.pre-release-replacements]]
exactly = 1
file = "CHANGELOG.md"
replace = """
<!-- next-url -->
[Unreleased]: https://github.com/rust-cli/anstyle/compare/{{tag_name}}...HEAD"""
search = "<!-- next-url -->"
[[bench]]
name = "parse"
harness = false
[dependencies.arrayvec]
version = "0.7.2"
optional = true
default-features = false
[dependencies.utf8parse]
version = "0.2.1"
optional = true
[dev-dependencies.codegenrs]
version = "3.0.1"
default-features = false
[dev-dependencies.criterion]
version = "0.5.1"
[dev-dependencies.proptest]
version = "1.4.0"
[dev-dependencies.snapbox]
version = "0.4.14"
features = ["path"]
[dev-dependencies.vte_generate_state_changes]
version = "0.1.1"
[features]
core = ["dep:arrayvec"]
default = ["utf8"]
utf8 = ["dep:utf8parse"]

201
vendor/anstyle-parse/LICENSE-APACHE vendored Normal file
View File

@ -0,0 +1,201 @@
Apache License
Version 2.0, January 2004
http://www.apache.org/licenses/
TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION
1. Definitions.
"License" shall mean the terms and conditions for use, reproduction,
and distribution as defined by Sections 1 through 9 of this document.
"Licensor" shall mean the copyright owner or entity authorized by
the copyright owner that is granting the License.
"Legal Entity" shall mean the union of the acting entity and all
other entities that control, are controlled by, or are under common
control with that entity. For the purposes of this definition,
"control" means (i) the power, direct or indirect, to cause the
direction or management of such entity, whether by contract or
otherwise, or (ii) ownership of fifty percent (50%) or more of the
outstanding shares, or (iii) beneficial ownership of such entity.
"You" (or "Your") shall mean an individual or Legal Entity
exercising permissions granted by this License.
"Source" form shall mean the preferred form for making modifications,
including but not limited to software source code, documentation
source, and configuration files.
"Object" form shall mean any form resulting from mechanical
transformation or translation of a Source form, including but
not limited to compiled object code, generated documentation,
and conversions to other media types.
"Work" shall mean the work of authorship, whether in Source or
Object form, made available under the License, as indicated by a
copyright notice that is included in or attached to the work
(an example is provided in the Appendix below).
"Derivative Works" shall mean any work, whether in Source or Object
form, that is based on (or derived from) the Work and for which the
editorial revisions, annotations, elaborations, or other modifications
represent, as a whole, an original work of authorship. For the purposes
of this License, Derivative Works shall not include works that remain
separable from, or merely link (or bind by name) to the interfaces of,
the Work and Derivative Works thereof.
"Contribution" shall mean any work of authorship, including
the original version of the Work and any modifications or additions
to that Work or Derivative Works thereof, that is intentionally
submitted to Licensor for inclusion in the Work by the copyright owner
or by an individual or Legal Entity authorized to submit on behalf of
the copyright owner. For the purposes of this definition, "submitted"
means any form of electronic, verbal, or written communication sent
to the Licensor or its representatives, including but not limited to
communication on electronic mailing lists, source code control systems,
and issue tracking systems that are managed by, or on behalf of, the
Licensor for the purpose of discussing and improving the Work, but
excluding communication that is conspicuously marked or otherwise
designated in writing by the copyright owner as "Not a Contribution."
"Contributor" shall mean Licensor and any individual or Legal Entity
on behalf of whom a Contribution has been received by Licensor and
subsequently incorporated within the Work.
2. Grant of Copyright License. Subject to the terms and conditions of
this License, each Contributor hereby grants to You a perpetual,
worldwide, non-exclusive, no-charge, royalty-free, irrevocable
copyright license to reproduce, prepare Derivative Works of,
publicly display, publicly perform, sublicense, and distribute the
Work and such Derivative Works in Source or Object form.
3. Grant of Patent License. Subject to the terms and conditions of
this License, each Contributor hereby grants to You a perpetual,
worldwide, non-exclusive, no-charge, royalty-free, irrevocable
(except as stated in this section) patent license to make, have made,
use, offer to sell, sell, import, and otherwise transfer the Work,
where such license applies only to those patent claims licensable
by such Contributor that are necessarily infringed by their
Contribution(s) alone or by combination of their Contribution(s)
with the Work to which such Contribution(s) was submitted. If You
institute patent litigation against any entity (including a
cross-claim or counterclaim in a lawsuit) alleging that the Work
or a Contribution incorporated within the Work constitutes direct
or contributory patent infringement, then any patent licenses
granted to You under this License for that Work shall terminate
as of the date such litigation is filed.
4. Redistribution. You may reproduce and distribute copies of the
Work or Derivative Works thereof in any medium, with or without
modifications, and in Source or Object form, provided that You
meet the following conditions:
(a) You must give any other recipients of the Work or
Derivative Works a copy of this License; and
(b) You must cause any modified files to carry prominent notices
stating that You changed the files; and
(c) You must retain, in the Source form of any Derivative Works
that You distribute, all copyright, patent, trademark, and
attribution notices from the Source form of the Work,
excluding those notices that do not pertain to any part of
the Derivative Works; and
(d) If the Work includes a "NOTICE" text file as part of its
distribution, then any Derivative Works that You distribute must
include a readable copy of the attribution notices contained
within such NOTICE file, excluding those notices that do not
pertain to any part of the Derivative Works, in at least one
of the following places: within a NOTICE text file distributed
as part of the Derivative Works; within the Source form or
documentation, if provided along with the Derivative Works; or,
within a display generated by the Derivative Works, if and
wherever such third-party notices normally appear. The contents
of the NOTICE file are for informational purposes only and
do not modify the License. You may add Your own attribution
notices within Derivative Works that You distribute, alongside
or as an addendum to the NOTICE text from the Work, provided
that such additional attribution notices cannot be construed
as modifying the License.
You may add Your own copyright statement to Your modifications and
may provide additional or different license terms and conditions
for use, reproduction, or distribution of Your modifications, or
for any such Derivative Works as a whole, provided Your use,
reproduction, and distribution of the Work otherwise complies with
the conditions stated in this License.
5. Submission of Contributions. Unless You explicitly state otherwise,
any Contribution intentionally submitted for inclusion in the Work
by You to the Licensor shall be under the terms and conditions of
this License, without any additional terms or conditions.
Notwithstanding the above, nothing herein shall supersede or modify
the terms of any separate license agreement you may have executed
with Licensor regarding such Contributions.
6. Trademarks. This License does not grant permission to use the trade
names, trademarks, service marks, or product names of the Licensor,
except as required for reasonable and customary use in describing the
origin of the Work and reproducing the content of the NOTICE file.
7. Disclaimer of Warranty. Unless required by applicable law or
agreed to in writing, Licensor provides the Work (and each
Contributor provides its Contributions) on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
implied, including, without limitation, any warranties or conditions
of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A
PARTICULAR PURPOSE. You are solely responsible for determining the
appropriateness of using or redistributing the Work and assume any
risks associated with Your exercise of permissions under this License.
8. Limitation of Liability. In no event and under no legal theory,
whether in tort (including negligence), contract, or otherwise,
unless required by applicable law (such as deliberate and grossly
negligent acts) or agreed to in writing, shall any Contributor be
liable to You for damages, including any direct, indirect, special,
incidental, or consequential damages of any character arising as a
result of this License or out of the use or inability to use the
Work (including but not limited to damages for loss of goodwill,
work stoppage, computer failure or malfunction, or any and all
other commercial damages or losses), even if such Contributor
has been advised of the possibility of such damages.
9. Accepting Warranty or Additional Liability. While redistributing
the Work or Derivative Works thereof, You may choose to offer,
and charge a fee for, acceptance of support, warranty, indemnity,
or other liability obligations and/or rights consistent with this
License. However, in accepting such obligations, You may act only
on Your own behalf and on Your sole responsibility, not on behalf
of any other Contributor, and only if You agree to indemnify,
defend, and hold each Contributor harmless for any liability
incurred by, or claims asserted against, such Contributor by reason
of your accepting any such warranty or additional liability.
END OF TERMS AND CONDITIONS
APPENDIX: How to apply the Apache License to your work.
To apply the Apache License to your work, attach the following
boilerplate notice, with the fields enclosed by brackets "{}"
replaced with your own identifying information. (Don't include
the brackets!) The text should be enclosed in the appropriate
comment syntax for the file format. We also recommend that a
file or class name and description of purpose be included on the
same "printed page" as the copyright notice for easier
identification within third-party archives.
Copyright {yyyy} {name of copyright owner}
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.

25
vendor/anstyle-parse/LICENSE-MIT vendored Normal file
View File

@ -0,0 +1,25 @@
Copyright (c) 2016 Joe Wilm and individual contributors
Permission is hereby granted, free of charge, to any
person obtaining a copy of this software and associated
documentation files (the "Software"), to deal in the
Software without restriction, including without
limitation the rights to use, copy, modify, merge,
publish, distribute, sublicense, and/or sell copies of
the Software, and to permit persons to whom the Software
is furnished to do so, subject to the following
conditions:
The above copyright notice and this permission notice
shall be included in all copies or substantial portions
of the Software.
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF
ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED
TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A
PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT
SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR
IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
DEALINGS IN THE SOFTWARE.

33
vendor/anstyle-parse/README.md vendored Normal file
View File

@ -0,0 +1,33 @@
# anstyle-parse
> Parse [ANSI Style Escapes](https://vt100.net/emu/dec_ansi_parser)
[![Documentation](https://img.shields.io/badge/docs-master-blue.svg)][Documentation]
![License](https://img.shields.io/crates/l/anstyle-parse.svg)
[![Crates Status](https://img.shields.io/crates/v/anstyle-parse.svg)](https://crates.io/crates/anstyle-parse)
## License
Licensed under either of
* Apache License, Version 2.0, ([LICENSE-APACHE](LICENSE-APACHE) or http://www.apache.org/licenses/LICENSE-2.0)
* MIT license ([LICENSE-MIT](LICENSE-MIT) or http://opensource.org/licenses/MIT)
at your option.
### Contribution
Unless you explicitly state otherwise, any contribution intentionally
submitted for inclusion in the work by you, as defined in the Apache-2.0
license, shall be dual licensed as above, without any additional terms or
conditions.
### Special Thanks
[chrisduerr](https://github.com/alacritty/vte/commits?author=chrisduerr) and the
[alacritty project](https://github.com/alacritty/alacritty) for
[vte](https://crates.io/crates/vte) which
[this was forked from](https://github.com/alacritty/vte/issues/82)
[Crates.io]: https://crates.io/crates/anstyle-parse
[Documentation]: https://docs.rs/anstyle-parse
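For orientation, a minimal consumer sketch (an addition, grounded only in the API exercised by the bench below): implement `Perform` for a sink and feed bytes through `Parser::advance`.
use anstyle_parse::{DefaultCharAccumulator, Parser, Perform};

#[derive(Default)]
struct Plain(String);

impl Perform for Plain {
    fn print(&mut self, c: char) {
        self.0.push(c); // keep printable characters
    }
    fn execute(&mut self, byte: u8) {
        if byte.is_ascii_whitespace() {
            self.0.push(byte as char); // keep \n, \t, ...
        }
    }
}

fn main() {
    let mut plain = Plain::default();
    let mut parser = Parser::<DefaultCharAccumulator>::new();
    for byte in b"\x1b[1;31mhi\x1b[0m there" {
        parser.advance(&mut plain, *byte);
    }
    assert_eq!(plain.0, "hi there");
}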

169
vendor/anstyle-parse/benches/parse.rs vendored Normal file
View File

@ -0,0 +1,169 @@
use criterion::{black_box, Criterion};
use anstyle_parse::*;
struct BenchDispatcher;
impl Perform for BenchDispatcher {
fn print(&mut self, c: char) {
black_box(c);
}
fn execute(&mut self, byte: u8) {
black_box(byte);
}
fn hook(&mut self, params: &Params, intermediates: &[u8], ignore: bool, c: u8) {
black_box((params, intermediates, ignore, c));
}
fn put(&mut self, byte: u8) {
black_box(byte);
}
fn osc_dispatch(&mut self, params: &[&[u8]], bell_terminated: bool) {
black_box((params, bell_terminated));
}
fn csi_dispatch(&mut self, params: &Params, intermediates: &[u8], ignore: bool, c: u8) {
black_box((params, intermediates, ignore, c));
}
fn esc_dispatch(&mut self, intermediates: &[u8], ignore: bool, byte: u8) {
black_box((intermediates, ignore, byte));
}
}
#[derive(Default)]
struct Strip(String);
impl Strip {
fn with_capacity(capacity: usize) -> Self {
Self(String::with_capacity(capacity))
}
}
impl Perform for Strip {
fn print(&mut self, c: char) {
self.0.push(c);
}
fn execute(&mut self, byte: u8) {
if byte.is_ascii_whitespace() {
self.0.push(byte as char);
}
}
}
fn strip_str(content: &str) -> String {
use anstyle_parse::state::state_change;
use anstyle_parse::state::Action;
use anstyle_parse::state::State;
#[inline]
fn is_utf8_continuation(b: u8) -> bool {
matches!(b, 0x80..=0xbf)
}
#[inline]
fn is_printable(action: Action, byte: u8) -> bool {
action == Action::Print
|| action == Action::BeginUtf8
// since we know the input is valid UTF-8, the only thing we can do with
// continuations is to print them
|| is_utf8_continuation(byte)
|| (action == Action::Execute && byte.is_ascii_whitespace())
}
let mut stripped = Vec::with_capacity(content.len());
let mut bytes = content.as_bytes();
while !bytes.is_empty() {
let offset = bytes.iter().copied().position(|b| {
let (_next_state, action) = state_change(State::Ground, b);
!is_printable(action, b)
});
let (printable, next) = bytes.split_at(offset.unwrap_or(bytes.len()));
stripped.extend(printable);
bytes = next;
let mut state = State::Ground;
let offset = bytes.iter().copied().position(|b| {
let (next_state, action) = state_change(state, b);
if next_state != State::Anywhere {
state = next_state;
}
is_printable(action, b)
});
let (_, next) = bytes.split_at(offset.unwrap_or(bytes.len()));
bytes = next;
}
String::from_utf8(stripped).unwrap()
}
fn parse(c: &mut Criterion) {
for (name, content) in [
#[cfg(feature = "utf8")]
("demo.vte", &include_bytes!("../tests/demo.vte")[..]),
("rg_help.vte", &include_bytes!("../tests/rg_help.vte")[..]),
("rg_linus.vte", &include_bytes!("../tests/rg_linus.vte")[..]),
(
"state_changes",
&b"\x1b]2;X\x1b\\ \x1b[0m \x1bP0@\x1b\\"[..],
),
] {
// Make sure the comparison is fair
if let Ok(content) = std::str::from_utf8(content) {
let mut stripped = Strip::with_capacity(content.len());
let mut parser = Parser::<DefaultCharAccumulator>::new();
for byte in content.as_bytes() {
parser.advance(&mut stripped, *byte);
}
assert_eq!(stripped.0, strip_str(content));
}
let mut group = c.benchmark_group(name);
group.bench_function("advance", |b| {
b.iter(|| {
let mut dispatcher = BenchDispatcher;
let mut parser = Parser::<DefaultCharAccumulator>::new();
for byte in content {
parser.advance(&mut dispatcher, *byte);
}
})
});
group.bench_function("advance_strip", |b| {
b.iter(|| {
let mut stripped = Strip::with_capacity(content.len());
let mut parser = Parser::<DefaultCharAccumulator>::new();
for byte in content {
parser.advance(&mut stripped, *byte);
}
black_box(stripped.0)
})
});
group.bench_function("state_change", |b| {
b.iter(|| {
let mut state = anstyle_parse::state::State::default();
for byte in content {
let (next_state, action) = anstyle_parse::state::state_change(state, *byte);
state = next_state;
black_box(action);
}
})
});
if let Ok(content) = std::str::from_utf8(content) {
group.bench_function("state_change_strip_str", |b| {
b.iter(|| {
let stripped = strip_str(content);
black_box(stripped)
})
});
}
}
}
criterion::criterion_group!(benches, parse);
criterion::criterion_main!(benches);

View File

@ -0,0 +1,78 @@
//! Parse input from stdin and log actions on stdout
use std::io::{self, Read};
use anstyle_parse::{DefaultCharAccumulator, Params, Parser, Perform};
/// A type implementing Perform that just logs actions
struct Log;
impl Perform for Log {
fn print(&mut self, c: char) {
println!("[print] {:?}", c);
}
fn execute(&mut self, byte: u8) {
println!("[execute] {:02x}", byte);
}
fn hook(&mut self, params: &Params, intermediates: &[u8], ignore: bool, c: u8) {
println!(
"[hook] params={:?}, intermediates={:?}, ignore={:?}, char={:?}",
params, intermediates, ignore, c
);
}
fn put(&mut self, byte: u8) {
println!("[put] {:02x}", byte);
}
fn unhook(&mut self) {
println!("[unhook]");
}
fn osc_dispatch(&mut self, params: &[&[u8]], bell_terminated: bool) {
println!(
"[osc_dispatch] params={:?} bell_terminated={}",
params, bell_terminated
);
}
fn csi_dispatch(&mut self, params: &Params, intermediates: &[u8], ignore: bool, c: u8) {
println!(
"[csi_dispatch] params={:#?}, intermediates={:?}, ignore={:?}, char={:?}",
params, intermediates, ignore, c
);
}
fn esc_dispatch(&mut self, intermediates: &[u8], ignore: bool, byte: u8) {
println!(
"[esc_dispatch] intermediates={:?}, ignore={:?}, byte={:02x}",
intermediates, ignore, byte
);
}
}
fn main() {
let input = io::stdin();
let mut handle = input.lock();
let mut statemachine = Parser::<DefaultCharAccumulator>::new();
let mut performer = Log;
let mut buf = [0; 2048];
loop {
match handle.read(&mut buf) {
Ok(0) => break,
Ok(n) => {
for byte in &buf[..n] {
statemachine.advance(&mut performer, *byte);
}
}
Err(err) => {
println!("err: {}", err);
break;
}
}
}
}

Some files were not shown because too many files have changed in this diff