0.11.1-alt1
- 0.11.1

parent 0e2054b322
commit 576b4e53aa
@@ -1 +1,2 @@
 tar: v@version@:.
+tar: vendor name=@name@-@version@-vendor base=vendor/
@@ -10,6 +10,7 @@ Url: https://github.com/typst/typst
 Packager: Sergey Konev <konevsa@altlinux.org>
 
 Source: %name-%version.tar
+Source1: %name-%version-vendor.tar
 
 BuildRequires(pre): rpm-build-rust
 BuildRequires: /proc

1 vendor/adler/.cargo-checksum.json vendored Normal file
@@ -0,0 +1 @@
{"files":{"CHANGELOG.md":"737088e45fdf27fe2cfedce163332d8ce08c58fd86ca287de2de34c0fbaf63e7","Cargo.toml":"f410869f0f1a5697f65a8a77be03da7aeecc0be26e7cf3a1feb1acaa4f518770","LICENSE-0BSD":"861399f8c21c042b110517e76dc6b63a2b334276c8cf17412fc3c8908ca8dc17","LICENSE-APACHE":"8ada45cd9f843acf64e4722ae262c622a2b3b3007c7310ef36ac1061a30f6adb","LICENSE-MIT":"23f18e03dc49df91622fe2a76176497404e46ced8a715d9d2b67a7446571cca3","README.md":"308c50cdb42b9573743068158339570b45ca3f895015ca3b87ba983edb0a21e6","RELEASE_PROCESS.md":"a86cd10fc70f167f8d00e9e4ce0c6b4ebdfa1865058390dffd1e0ad4d3e68d9d","benches/bench.rs":"c07ce370e3680c602e415f8d1ec4e543ea2163ab22a09b6b82d93e8a30adca82","src/algo.rs":"b664b131f724a809591394a10b9023f40ab5963e32a83fa3163c2668e59c8b66","src/lib.rs":"b55ba9c629b30360d08168b2ca0c96275432856a539737a105a6d6ae6bf7e88f"},"package":"f26201604c87b1e01bd3d98f8d5d9a8fcbb815e8cedb41ffccbeb4bf593a35fe"}

63 vendor/adler/CHANGELOG.md vendored Normal file
@@ -0,0 +1,63 @@
# Changelog

## Unreleased

No changes.

## [1.0.2 - 2021-02-26](https://github.com/jonas-schievink/adler/releases/tag/v1.0.2)

- Fix doctest on big-endian systems ([#9]).

[#9]: https://github.com/jonas-schievink/adler/pull/9

## [1.0.1 - 2020-11-08](https://github.com/jonas-schievink/adler/releases/tag/v1.0.1)

### Fixes

- Fix documentation on docs.rs.

## [1.0.0 - 2020-11-08](https://github.com/jonas-schievink/adler/releases/tag/v1.0.0)

### Fixes

- Fix `cargo test --no-default-features` ([#5]).

### Improvements

- Extended and clarified documentation.
- Added more rustdoc examples.
- Extended CI to test the crate with `--no-default-features`.

### Breaking Changes

- `adler32_reader` now takes its generic argument by value instead of as a `&mut`.
- Renamed `adler32_reader` to `adler32`.

## [0.2.3 - 2020-07-11](https://github.com/jonas-schievink/adler/releases/tag/v0.2.3)

- Process 4 Bytes at a time, improving performance by up to 50% ([#2]).

## [0.2.2 - 2020-06-27](https://github.com/jonas-schievink/adler/releases/tag/v0.2.2)

- Bump MSRV to 1.31.0.

## [0.2.1 - 2020-06-27](https://github.com/jonas-schievink/adler/releases/tag/v0.2.1)

- Add a few `#[inline]` annotations to small functions.
- Fix CI badge.
- Allow integration into libstd.

## [0.2.0 - 2020-06-27](https://github.com/jonas-schievink/adler/releases/tag/v0.2.0)

- Support `#![no_std]` when using `default-features = false`.
- Improve performance by around 7x.
- Support Rust 1.8.0.
- Improve API naming.

## [0.1.0 - 2020-06-26](https://github.com/jonas-schievink/adler/releases/tag/v0.1.0)

Initial release.


[#2]: https://github.com/jonas-schievink/adler/pull/2
[#5]: https://github.com/jonas-schievink/adler/pull/5

64 vendor/adler/Cargo.toml vendored Normal file
@@ -0,0 +1,64 @@
# THIS FILE IS AUTOMATICALLY GENERATED BY CARGO
#
# When uploading crates to the registry Cargo will automatically
# "normalize" Cargo.toml files for maximal compatibility
# with all versions of Cargo and also rewrite `path` dependencies
# to registry (e.g., crates.io) dependencies
#
# If you believe there's an error in this file please file an
# issue against the rust-lang/cargo repository. If you're
# editing this file be aware that the upstream Cargo.toml
# will likely look very different (and much more reasonable)

[package]
name = "adler"
version = "1.0.2"
authors = ["Jonas Schievink <jonasschievink@gmail.com>"]
description = "A simple clean-room implementation of the Adler-32 checksum"
documentation = "https://docs.rs/adler/"
readme = "README.md"
keywords = ["checksum", "integrity", "hash", "adler32", "zlib"]
categories = ["algorithms"]
license = "0BSD OR MIT OR Apache-2.0"
repository = "https://github.com/jonas-schievink/adler.git"
[package.metadata.docs.rs]
rustdoc-args = ["--cfg=docsrs"]

[package.metadata.release]
no-dev-version = true
pre-release-commit-message = "Release {{version}}"
tag-message = "{{version}}"

[[package.metadata.release.pre-release-replacements]]
file = "CHANGELOG.md"
replace = "## Unreleased\n\nNo changes.\n\n## [{{version}} - {{date}}](https://github.com/jonas-schievink/adler/releases/tag/v{{version}})\n"
search = "## Unreleased\n"

[[package.metadata.release.pre-release-replacements]]
file = "README.md"
replace = "adler = \"{{version}}\""
search = "adler = \"[a-z0-9\\\\.-]+\""

[[package.metadata.release.pre-release-replacements]]
file = "src/lib.rs"
replace = "https://docs.rs/adler/{{version}}"
search = "https://docs.rs/adler/[a-z0-9\\.-]+"

[[bench]]
name = "bench"
harness = false
[dependencies.compiler_builtins]
version = "0.1.2"
optional = true

[dependencies.core]
version = "1.0.0"
optional = true
package = "rustc-std-workspace-core"
[dev-dependencies.criterion]
version = "0.3.2"

[features]
default = ["std"]
rustc-dep-of-std = ["core", "compiler_builtins"]
std = []

12 vendor/adler/LICENSE-0BSD vendored Normal file
@@ -0,0 +1,12 @@
Copyright (C) Jonas Schievink <jonasschievink@gmail.com>

Permission to use, copy, modify, and/or distribute this software for
any purpose with or without fee is hereby granted.

THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR
ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN
AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT
OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.

201 vendor/adler/LICENSE-APACHE vendored Normal file
@@ -0,0 +1,201 @@
                                 Apache License
                           Version 2.0, January 2004
                 https://www.apache.org/licenses/LICENSE-2.0

   TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION

   1. Definitions.

      "License" shall mean the terms and conditions for use, reproduction,
      and distribution as defined by Sections 1 through 9 of this document.

      "Licensor" shall mean the copyright owner or entity authorized by
      the copyright owner that is granting the License.

      "Legal Entity" shall mean the union of the acting entity and all
      other entities that control, are controlled by, or are under common
      control with that entity. For the purposes of this definition,
      "control" means (i) the power, direct or indirect, to cause the
      direction or management of such entity, whether by contract or
      otherwise, or (ii) ownership of fifty percent (50%) or more of the
      outstanding shares, or (iii) beneficial ownership of such entity.

      "You" (or "Your") shall mean an individual or Legal Entity
      exercising permissions granted by this License.

      "Source" form shall mean the preferred form for making modifications,
      including but not limited to software source code, documentation
      source, and configuration files.

      "Object" form shall mean any form resulting from mechanical
      transformation or translation of a Source form, including but
      not limited to compiled object code, generated documentation,
      and conversions to other media types.

      "Work" shall mean the work of authorship, whether in Source or
      Object form, made available under the License, as indicated by a
      copyright notice that is included in or attached to the work
      (an example is provided in the Appendix below).

      "Derivative Works" shall mean any work, whether in Source or Object
      form, that is based on (or derived from) the Work and for which the
      editorial revisions, annotations, elaborations, or other modifications
      represent, as a whole, an original work of authorship. For the purposes
      of this License, Derivative Works shall not include works that remain
      separable from, or merely link (or bind by name) to the interfaces of,
      the Work and Derivative Works thereof.

      "Contribution" shall mean any work of authorship, including
      the original version of the Work and any modifications or additions
      to that Work or Derivative Works thereof, that is intentionally
      submitted to Licensor for inclusion in the Work by the copyright owner
      or by an individual or Legal Entity authorized to submit on behalf of
      the copyright owner. For the purposes of this definition, "submitted"
      means any form of electronic, verbal, or written communication sent
      to the Licensor or its representatives, including but not limited to
      communication on electronic mailing lists, source code control systems,
      and issue tracking systems that are managed by, or on behalf of, the
      Licensor for the purpose of discussing and improving the Work, but
      excluding communication that is conspicuously marked or otherwise
      designated in writing by the copyright owner as "Not a Contribution."

      "Contributor" shall mean Licensor and any individual or Legal Entity
      on behalf of whom a Contribution has been received by Licensor and
      subsequently incorporated within the Work.

   2. Grant of Copyright License. Subject to the terms and conditions of
      this License, each Contributor hereby grants to You a perpetual,
      worldwide, non-exclusive, no-charge, royalty-free, irrevocable
      copyright license to reproduce, prepare Derivative Works of,
      publicly display, publicly perform, sublicense, and distribute the
      Work and such Derivative Works in Source or Object form.

   3. Grant of Patent License. Subject to the terms and conditions of
      this License, each Contributor hereby grants to You a perpetual,
      worldwide, non-exclusive, no-charge, royalty-free, irrevocable
      (except as stated in this section) patent license to make, have made,
      use, offer to sell, sell, import, and otherwise transfer the Work,
      where such license applies only to those patent claims licensable
      by such Contributor that are necessarily infringed by their
      Contribution(s) alone or by combination of their Contribution(s)
      with the Work to which such Contribution(s) was submitted. If You
      institute patent litigation against any entity (including a
      cross-claim or counterclaim in a lawsuit) alleging that the Work
      or a Contribution incorporated within the Work constitutes direct
      or contributory patent infringement, then any patent licenses
      granted to You under this License for that Work shall terminate
      as of the date such litigation is filed.

   4. Redistribution. You may reproduce and distribute copies of the
      Work or Derivative Works thereof in any medium, with or without
      modifications, and in Source or Object form, provided that You
      meet the following conditions:

      (a) You must give any other recipients of the Work or
          Derivative Works a copy of this License; and

      (b) You must cause any modified files to carry prominent notices
          stating that You changed the files; and

      (c) You must retain, in the Source form of any Derivative Works
          that You distribute, all copyright, patent, trademark, and
          attribution notices from the Source form of the Work,
          excluding those notices that do not pertain to any part of
          the Derivative Works; and

      (d) If the Work includes a "NOTICE" text file as part of its
          distribution, then any Derivative Works that You distribute must
          include a readable copy of the attribution notices contained
          within such NOTICE file, excluding those notices that do not
          pertain to any part of the Derivative Works, in at least one
          of the following places: within a NOTICE text file distributed
          as part of the Derivative Works; within the Source form or
          documentation, if provided along with the Derivative Works; or,
          within a display generated by the Derivative Works, if and
          wherever such third-party notices normally appear. The contents
          of the NOTICE file are for informational purposes only and
          do not modify the License. You may add Your own attribution
          notices within Derivative Works that You distribute, alongside
          or as an addendum to the NOTICE text from the Work, provided
          that such additional attribution notices cannot be construed
          as modifying the License.

      You may add Your own copyright statement to Your modifications and
      may provide additional or different license terms and conditions
      for use, reproduction, or distribution of Your modifications, or
      for any such Derivative Works as a whole, provided Your use,
      reproduction, and distribution of the Work otherwise complies with
      the conditions stated in this License.

   5. Submission of Contributions. Unless You explicitly state otherwise,
      any Contribution intentionally submitted for inclusion in the Work
      by You to the Licensor shall be under the terms and conditions of
      this License, without any additional terms or conditions.
      Notwithstanding the above, nothing herein shall supersede or modify
      the terms of any separate license agreement you may have executed
      with Licensor regarding such Contributions.

   6. Trademarks. This License does not grant permission to use the trade
      names, trademarks, service marks, or product names of the Licensor,
      except as required for reasonable and customary use in describing the
      origin of the Work and reproducing the content of the NOTICE file.

   7. Disclaimer of Warranty. Unless required by applicable law or
      agreed to in writing, Licensor provides the Work (and each
      Contributor provides its Contributions) on an "AS IS" BASIS,
      WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
      implied, including, without limitation, any warranties or conditions
      of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A
      PARTICULAR PURPOSE. You are solely responsible for determining the
      appropriateness of using or redistributing the Work and assume any
      risks associated with Your exercise of permissions under this License.

   8. Limitation of Liability. In no event and under no legal theory,
      whether in tort (including negligence), contract, or otherwise,
      unless required by applicable law (such as deliberate and grossly
      negligent acts) or agreed to in writing, shall any Contributor be
      liable to You for damages, including any direct, indirect, special,
      incidental, or consequential damages of any character arising as a
      result of this License or out of the use or inability to use the
      Work (including but not limited to damages for loss of goodwill,
      work stoppage, computer failure or malfunction, or any and all
      other commercial damages or losses), even if such Contributor
      has been advised of the possibility of such damages.

   9. Accepting Warranty or Additional Liability. While redistributing
      the Work or Derivative Works thereof, You may choose to offer,
      and charge a fee for, acceptance of support, warranty, indemnity,
      or other liability obligations and/or rights consistent with this
      License. However, in accepting such obligations, You may act only
      on Your own behalf and on Your sole responsibility, not on behalf
      of any other Contributor, and only if You agree to indemnify,
      defend, and hold each Contributor harmless for any liability
      incurred by, or claims asserted against, such Contributor by reason
      of your accepting any such warranty or additional liability.

   END OF TERMS AND CONDITIONS

   APPENDIX: How to apply the Apache License to your work.

      To apply the Apache License to your work, attach the following
      boilerplate notice, with the fields enclosed by brackets "[]"
      replaced with your own identifying information. (Don't include
      the brackets!) The text should be enclosed in the appropriate
      comment syntax for the file format. We also recommend that a
      file or class name and description of purpose be included on the
      same "printed page" as the copyright notice for easier
      identification within third-party archives.

   Copyright [yyyy] [name of copyright owner]

   Licensed under the Apache License, Version 2.0 (the "License");
   you may not use this file except in compliance with the License.
   You may obtain a copy of the License at

       https://www.apache.org/licenses/LICENSE-2.0

   Unless required by applicable law or agreed to in writing, software
   distributed under the License is distributed on an "AS IS" BASIS,
   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
   See the License for the specific language governing permissions and
   limitations under the License.

23 vendor/adler/LICENSE-MIT vendored Normal file
@@ -0,0 +1,23 @@
Permission is hereby granted, free of charge, to any
person obtaining a copy of this software and associated
documentation files (the "Software"), to deal in the
Software without restriction, including without
limitation the rights to use, copy, modify, merge,
publish, distribute, sublicense, and/or sell copies of
the Software, and to permit persons to whom the Software
is furnished to do so, subject to the following
conditions:

The above copyright notice and this permission notice
shall be included in all copies or substantial portions
of the Software.

THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF
ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED
TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A
PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT
SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR
IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
DEALINGS IN THE SOFTWARE.

39 vendor/adler/README.md vendored Normal file
@@ -0,0 +1,39 @@
# Adler-32 checksums for Rust

[](https://crates.io/crates/adler)
[](https://docs.rs/adler/)
![]()

This crate provides a simple implementation of the Adler-32 checksum, used in
the zlib compression format.

Please refer to the [changelog](CHANGELOG.md) to see what changed in the last
releases.

## Features

- Permissively licensed (0BSD) clean-room implementation.
- Zero dependencies.
- Zero `unsafe`.
- Decent performance (3-4 GB/s).
- Supports `#![no_std]` (with `default-features = false`).

## Usage

Add an entry to your `Cargo.toml`:

```toml
[dependencies]
adler = "1.0.2"
```

Check the [API Documentation](https://docs.rs/adler/) for how to use the
crate's functionality.
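
For a quick orientation, here is a minimal sketch of the programmatic API; it simply mirrors the crate's own doc examples and test suite below (the expected value is the Adler-32 of `b"Wikipedia"` from the tests):

```rust
use adler::{adler32_slice, Adler32};

fn main() {
    // One-shot convenience function over a byte slice.
    assert_eq!(adler32_slice(b"Wikipedia"), 0x11E60398);

    // Equivalent piecewise calculation with the `Adler32` hasher;
    // splitting the input must not change the checksum.
    let mut hasher = Adler32::new();
    hasher.write_slice(b"Wiki");
    hasher.write_slice(b"pedia");
    assert_eq!(hasher.checksum(), 0x11E60398);
}
```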

## Rust version support

Currently, this crate supports all Rust versions starting at Rust 1.31.0.

Bumping the Minimum Supported Rust Version (MSRV) is *not* considered a breaking
change, but will not be done without good reasons. The latest 3 stable Rust
versions will always be supported no matter what.

13 vendor/adler/RELEASE_PROCESS.md vendored Normal file
@@ -0,0 +1,13 @@
# What to do to publish a new release

1. Ensure all notable changes are in the changelog under "Unreleased".

2. Execute `cargo release <level>` to bump version(s), tag and publish
   everything. External subcommand, must be installed with `cargo install
   cargo-release`.

   `<level>` can be one of `major|minor|patch`. If this is the first release
   (`0.1.0`), use `minor`, since the version starts out as `0.0.0`.

3. Go to the GitHub releases, edit the just-pushed tag. Copy the release notes
   from the changelog.

109 vendor/adler/benches/bench.rs vendored Normal file
@@ -0,0 +1,109 @@
extern crate adler;
extern crate criterion;

use adler::{adler32_slice, Adler32};
use criterion::{criterion_group, criterion_main, Criterion, Throughput};

fn simple(c: &mut Criterion) {
    {
        const SIZE: usize = 100;

        let mut group = c.benchmark_group("simple-100b");
        group.throughput(Throughput::Bytes(SIZE as u64));
        group.bench_function("zeroes-100", |bencher| {
            bencher.iter(|| {
                adler32_slice(&[0; SIZE]);
            });
        });
        group.bench_function("ones-100", |bencher| {
            bencher.iter(|| {
                adler32_slice(&[0xff; SIZE]);
            });
        });
    }

    {
        const SIZE: usize = 1024;

        let mut group = c.benchmark_group("simple-1k");
        group.throughput(Throughput::Bytes(SIZE as u64));

        group.bench_function("zeroes-1k", |bencher| {
            bencher.iter(|| {
                adler32_slice(&[0; SIZE]);
            });
        });

        group.bench_function("ones-1k", |bencher| {
            bencher.iter(|| {
                adler32_slice(&[0xff; SIZE]);
            });
        });
    }

    {
        const SIZE: usize = 1024 * 1024;

        let mut group = c.benchmark_group("simple-1m");
        group.throughput(Throughput::Bytes(SIZE as u64));
        group.bench_function("zeroes-1m", |bencher| {
            bencher.iter(|| {
                adler32_slice(&[0; SIZE]);
            });
        });

        group.bench_function("ones-1m", |bencher| {
            bencher.iter(|| {
                adler32_slice(&[0xff; SIZE]);
            });
        });
    }
}

fn chunked(c: &mut Criterion) {
    const SIZE: usize = 16 * 1024 * 1024;

    let data = vec![0xAB; SIZE];

    let mut group = c.benchmark_group("chunked-16m");
    group.throughput(Throughput::Bytes(SIZE as u64));
    group.bench_function("5552", |bencher| {
        bencher.iter(|| {
            let mut h = Adler32::new();
            for chunk in data.chunks(5552) {
                h.write_slice(chunk);
            }
            h.checksum()
        });
    });
    group.bench_function("8k", |bencher| {
        bencher.iter(|| {
            let mut h = Adler32::new();
            for chunk in data.chunks(8 * 1024) {
                h.write_slice(chunk);
            }
            h.checksum()
        });
    });
    group.bench_function("64k", |bencher| {
        bencher.iter(|| {
            let mut h = Adler32::new();
            for chunk in data.chunks(64 * 1024) {
                h.write_slice(chunk);
            }
            h.checksum()
        });
    });
    group.bench_function("1m", |bencher| {
        bencher.iter(|| {
            let mut h = Adler32::new();
            for chunk in data.chunks(1024 * 1024) {
                h.write_slice(chunk);
            }
            h.checksum()
        });
    });
}

criterion_group!(benches, simple, chunked);
criterion_main!(benches);

146 vendor/adler/src/algo.rs vendored Normal file
@@ -0,0 +1,146 @@
use crate::Adler32;
use std::ops::{AddAssign, MulAssign, RemAssign};

impl Adler32 {
    pub(crate) fn compute(&mut self, bytes: &[u8]) {
        // The basic algorithm is, for every byte:
        //   a = (a + byte) % MOD
        //   b = (b + a) % MOD
        // where MOD = 65521.
        //
        // For efficiency, we can defer the `% MOD` operations as long as neither a nor b overflows:
        // - Between calls to `write`, we ensure that a and b are always in range 0..MOD.
        // - We use 32-bit arithmetic in this function.
        // - Therefore, a and b must not increase by more than 2^32-MOD without performing a `% MOD`
        //   operation.
        //
        // According to Wikipedia, b is calculated as follows for non-incremental checksumming:
        //   b = n×D1 + (n−1)×D2 + (n−2)×D3 + ... + Dn + n*1 (mod 65521)
        // where n is the number of bytes and Di is the i-th byte. We need to change this to account
        // for the previous values of a and b, as well as treat every input byte as being 255:
        //   b_inc = n×255 + (n-1)×255 + ... + 255 + n*65520
        // Or in other words:
        //   b_inc = n*65520 + n(n+1)/2*255
        // The max chunk size is thus the largest value of n so that b_inc <= 2^32-65521.
        //   2^32-65521 = n*65520 + n(n+1)/2*255
        // Plugging this into an equation solver since I can't math gives n = 5552.18..., so 5552.
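        // Sanity check of that bound: n = 5552 gives
        //   b_inc = 5552*65520 + 5552*5553/2*255 = 4_294_624_680,
        // which stays below 2^32-65521 = 4_294_901_775, while n = 5553 gives
        //   b_inc = 4_296_106_215,
        // which exceeds it, so 5552 really is the largest safe chunk length.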
        //
        // On top of the optimization outlined above, the algorithm can also be parallelized with a
        // bit more work:
        //
        // Note that b is a linear combination of a vector of input bytes (D1, ..., Dn).
        //
        // If we fix some value k<N and rewrite indices 1, ..., N as
        //
        //   1_1, 1_2, ..., 1_k, 2_1, ..., 2_k, ..., (N/k)_k,
        //
        // then we can express a and b in terms of sums of smaller sequences kb and ka:
        //
        //   ka(j) := D1_j + D2_j + ... + D(N/k)_j where j <= k
        //   kb(j) := (N/k)*D1_j + (N/k-1)*D2_j + ... + D(N/k)_j where j <= k
        //
        //   a = ka(1) + ka(2) + ... + ka(k) + 1
        //   b = k*(kb(1) + kb(2) + ... + kb(k)) - 1*ka(2) - ... - (k-1)*ka(k) + N
        //
        // We use this insight to unroll the main loop and process k=4 bytes at a time.
        // The resulting code is highly amenable to SIMD acceleration, although the immediate speedups
        // stem from increased pipeline parallelism rather than auto-vectorization.
        //
        // This technique is described in-depth here: https://software.intel.com/content/www/us/
        // en/develop/articles/fast-computation-of-fletcher-checksums.html

        const MOD: u32 = 65521;
        const CHUNK_SIZE: usize = 5552 * 4;

        let mut a = u32::from(self.a);
        let mut b = u32::from(self.b);
        let mut a_vec = U32X4([0; 4]);
        let mut b_vec = a_vec;

        let (bytes, remainder) = bytes.split_at(bytes.len() - bytes.len() % 4);

        // iterate over 4 bytes at a time
        let chunk_iter = bytes.chunks_exact(CHUNK_SIZE);
        let remainder_chunk = chunk_iter.remainder();
        for chunk in chunk_iter {
            for byte_vec in chunk.chunks_exact(4) {
                let val = U32X4::from(byte_vec);
                a_vec += val;
                b_vec += a_vec;
            }
            b += CHUNK_SIZE as u32 * a;
            a_vec %= MOD;
            b_vec %= MOD;
            b %= MOD;
        }
        // special-case the final chunk because it may be shorter than the rest
        for byte_vec in remainder_chunk.chunks_exact(4) {
            let val = U32X4::from(byte_vec);
            a_vec += val;
            b_vec += a_vec;
        }
        b += remainder_chunk.len() as u32 * a;
        a_vec %= MOD;
        b_vec %= MOD;
        b %= MOD;

        // combine the sub-sum results into the main sum
        b_vec *= 4;
        b_vec.0[1] += MOD - a_vec.0[1];
        b_vec.0[2] += (MOD - a_vec.0[2]) * 2;
        b_vec.0[3] += (MOD - a_vec.0[3]) * 3;
        for &av in a_vec.0.iter() {
            a += av;
        }
        for &bv in b_vec.0.iter() {
            b += bv;
        }

        // iterate over the remaining few bytes in serial
        for &byte in remainder.iter() {
            a += u32::from(byte);
            b += a;
        }

        self.a = (a % MOD) as u16;
        self.b = (b % MOD) as u16;
    }
}

#[derive(Copy, Clone)]
struct U32X4([u32; 4]);

impl U32X4 {
    fn from(bytes: &[u8]) -> Self {
        U32X4([
            u32::from(bytes[0]),
            u32::from(bytes[1]),
            u32::from(bytes[2]),
            u32::from(bytes[3]),
        ])
    }
}

impl AddAssign<Self> for U32X4 {
    fn add_assign(&mut self, other: Self) {
        for (s, o) in self.0.iter_mut().zip(other.0.iter()) {
            *s += o;
        }
    }
}

impl RemAssign<u32> for U32X4 {
    fn rem_assign(&mut self, quotient: u32) {
        for s in self.0.iter_mut() {
            *s %= quotient;
        }
    }
}

impl MulAssign<u32> for U32X4 {
    fn mul_assign(&mut self, rhs: u32) {
        for s in self.0.iter_mut() {
            *s *= rhs;
        }
    }
}

287 vendor/adler/src/lib.rs vendored Normal file
@@ -0,0 +1,287 @@
//! Adler-32 checksum implementation.
//!
//! This implementation features:
//!
//! - Permissively licensed (0BSD) clean-room implementation.
//! - Zero dependencies.
//! - Zero `unsafe`.
//! - Decent performance (3-4 GB/s).
//! - `#![no_std]` support (with `default-features = false`).

#![doc(html_root_url = "https://docs.rs/adler/1.0.2")]
// Deny a few warnings in doctests, since rustdoc `allow`s many warnings by default
#![doc(test(attr(deny(unused_imports, unused_must_use))))]
#![cfg_attr(docsrs, feature(doc_cfg))]
#![warn(missing_debug_implementations)]
#![forbid(unsafe_code)]
#![cfg_attr(not(feature = "std"), no_std)]

#[cfg(not(feature = "std"))]
extern crate core as std;

mod algo;

use std::hash::Hasher;

#[cfg(feature = "std")]
use std::io::{self, BufRead};

/// Adler-32 checksum calculator.
///
/// An instance of this type is equivalent to an Adler-32 checksum: It can be created in the default
/// state via [`new`] (or the provided `Default` impl), or from a precalculated checksum via
/// [`from_checksum`], and the currently stored checksum can be fetched via [`checksum`].
///
/// This type also implements `Hasher`, which makes it easy to calculate Adler-32 checksums of any
/// type that implements or derives `Hash`. This also allows using Adler-32 in a `HashMap`, although
/// that is not recommended (while every checksum is a hash function, they are not necessarily a
/// good one).
///
/// # Examples
///
/// Basic, piecewise checksum calculation:
///
/// ```
/// use adler::Adler32;
///
/// let mut adler = Adler32::new();
///
/// adler.write_slice(&[0, 1, 2]);
/// adler.write_slice(&[3, 4, 5]);
///
/// assert_eq!(adler.checksum(), 0x00290010);
/// ```
///
/// Using `Hash` to process structures:
///
/// ```
/// use std::hash::Hash;
/// use adler::Adler32;
///
/// #[derive(Hash)]
/// struct Data {
///     byte: u8,
///     word: u16,
///     big: u64,
/// }
///
/// let mut adler = Adler32::new();
///
/// let data = Data { byte: 0x1F, word: 0xABCD, big: !0 };
/// data.hash(&mut adler);
///
/// // hash value depends on architecture endianness
/// if cfg!(target_endian = "little") {
///     assert_eq!(adler.checksum(), 0x33410990);
/// }
/// if cfg!(target_endian = "big") {
///     assert_eq!(adler.checksum(), 0x331F0990);
/// }
///
/// ```
///
/// [`new`]: #method.new
/// [`from_checksum`]: #method.from_checksum
/// [`checksum`]: #method.checksum
#[derive(Debug, Copy, Clone)]
pub struct Adler32 {
    a: u16,
    b: u16,
}

impl Adler32 {
    /// Creates a new Adler-32 instance with default state.
    #[inline]
    pub fn new() -> Self {
        Self::default()
    }

    /// Creates an `Adler32` instance from a precomputed Adler-32 checksum.
    ///
    /// This allows resuming checksum calculation without having to keep the `Adler32` instance
    /// around.
    ///
    /// # Example
    ///
    /// ```
    /// # use adler::Adler32;
    /// let parts = [
    ///     "rust",
    ///     "acean",
    /// ];
    /// let whole = adler::adler32_slice(b"rustacean");
    ///
    /// let mut sum = Adler32::new();
    /// sum.write_slice(parts[0].as_bytes());
    /// let partial = sum.checksum();
    ///
    /// // ...later
    ///
    /// let mut sum = Adler32::from_checksum(partial);
    /// sum.write_slice(parts[1].as_bytes());
    /// assert_eq!(sum.checksum(), whole);
    /// ```
    #[inline]
    pub fn from_checksum(sum: u32) -> Self {
        Adler32 {
            a: sum as u16,
            b: (sum >> 16) as u16,
        }
    }

    /// Returns the calculated checksum at this point in time.
    #[inline]
    pub fn checksum(&self) -> u32 {
        (u32::from(self.b) << 16) | u32::from(self.a)
    }

    /// Adds `bytes` to the checksum calculation.
    ///
    /// If efficiency matters, this should be called with byte slices that contain at least a few
    /// thousand bytes.
    pub fn write_slice(&mut self, bytes: &[u8]) {
        self.compute(bytes);
    }
}

impl Default for Adler32 {
    #[inline]
    fn default() -> Self {
        Adler32 { a: 1, b: 0 }
    }
}

impl Hasher for Adler32 {
    #[inline]
    fn finish(&self) -> u64 {
        u64::from(self.checksum())
    }

    fn write(&mut self, bytes: &[u8]) {
        self.write_slice(bytes);
    }
}

/// Calculates the Adler-32 checksum of a byte slice.
///
/// This is a convenience function around the [`Adler32`] type.
///
/// [`Adler32`]: struct.Adler32.html
pub fn adler32_slice(data: &[u8]) -> u32 {
    let mut h = Adler32::new();
    h.write_slice(data);
    h.checksum()
}

/// Calculates the Adler-32 checksum of a `BufRead`'s contents.
///
/// The passed `BufRead` implementor will be read until it reaches EOF (or until it reports an
/// error).
///
/// If you only have a `Read` implementor, you can wrap it in `std::io::BufReader` before calling
/// this function.
///
/// # Errors
///
/// Any error returned by the reader is bubbled up by this function.
///
/// # Examples
///
/// ```no_run
/// # fn run() -> Result<(), Box<dyn std::error::Error>> {
/// use adler::adler32;
///
/// use std::fs::File;
/// use std::io::BufReader;
///
/// let file = File::open("input.txt")?;
/// let mut file = BufReader::new(file);
///
/// adler32(&mut file)?;
/// # Ok(()) }
/// # fn main() { run().unwrap() }
/// ```
#[cfg(feature = "std")]
#[cfg_attr(docsrs, doc(cfg(feature = "std")))]
pub fn adler32<R: BufRead>(mut reader: R) -> io::Result<u32> {
    let mut h = Adler32::new();
    loop {
        let len = {
            let buf = reader.fill_buf()?;
            if buf.is_empty() {
                return Ok(h.checksum());
            }

            h.write_slice(buf);
            buf.len()
        };
        reader.consume(len);
    }
}

#[cfg(test)]
mod tests {
    use super::*;

    #[test]
    fn zeroes() {
        assert_eq!(adler32_slice(&[]), 1);
        assert_eq!(adler32_slice(&[0]), 1 | 1 << 16);
        assert_eq!(adler32_slice(&[0, 0]), 1 | 2 << 16);
        assert_eq!(adler32_slice(&[0; 100]), 0x00640001);
        assert_eq!(adler32_slice(&[0; 1024]), 0x04000001);
        assert_eq!(adler32_slice(&[0; 1024 * 1024]), 0x00f00001);
    }

    #[test]
    fn ones() {
        assert_eq!(adler32_slice(&[0xff; 1024]), 0x79a6fc2e);
        assert_eq!(adler32_slice(&[0xff; 1024 * 1024]), 0x8e88ef11);
    }

    #[test]
    fn mixed() {
        assert_eq!(adler32_slice(&[1]), 2 | 2 << 16);
        assert_eq!(adler32_slice(&[40]), 41 | 41 << 16);

        assert_eq!(adler32_slice(&[0xA5; 1024 * 1024]), 0xd5009ab1);
    }

    /// Example calculation from https://en.wikipedia.org/wiki/Adler-32.
    #[test]
    fn wiki() {
        assert_eq!(adler32_slice(b"Wikipedia"), 0x11E60398);
    }

    #[test]
    fn resume() {
        let mut adler = Adler32::new();
        adler.write_slice(&[0xff; 1024]);
        let partial = adler.checksum();
        assert_eq!(partial, 0x79a6fc2e); // from above
        adler.write_slice(&[0xff; 1024 * 1024 - 1024]);
        assert_eq!(adler.checksum(), 0x8e88ef11); // from above

        // Make sure that we can resume computing from the partial checksum via `from_checksum`.
        let mut adler = Adler32::from_checksum(partial);
        adler.write_slice(&[0xff; 1024 * 1024 - 1024]);
        assert_eq!(adler.checksum(), 0x8e88ef11); // from above
    }

    #[cfg(feature = "std")]
    #[test]
    fn bufread() {
        use std::io::BufReader;
        fn test(data: &[u8], checksum: u32) {
            // `BufReader` uses an 8 KB buffer, so this will test buffer refilling.
            let mut buf = BufReader::new(data);
            let real_sum = adler32(&mut buf).unwrap();
            assert_eq!(checksum, real_sum);
        }

        test(&[], 1);
        test(&[0; 1024], 0x04000001);
        test(&[0; 1024 * 1024], 0x00f00001);
        test(&[0xA5; 1024 * 1024], 0xd5009ab1);
    }
}

1 vendor/aho-corasick/.cargo-checksum.json vendored Normal file
@@ -0,0 +1 @@
{"files":{"COPYING":"01c266bced4a434da0051174d6bee16a4c82cf634e2679b6155d40d75012390f","Cargo.toml":"747d0fcb1257c9b8b013104da3c5a67f5d6cf8a95a2163b13703c01cab2c010a","DESIGN.md":"59c960e1b73b1d7fb41e4df6c0c1b1fcf44dd2ebc8a349597a7d0595f8cb5130","LICENSE-MIT":"0f96a83840e146e43c0ec96a22ec1f392e0680e6c1226e6f3ba87e0740af850f","README.md":"afc4d559a98cf190029af0bf320fc0022725e349cd2a303aac860254e28f3c53","UNLICENSE":"7e12e5df4bae12cb21581ba157ced20e1986a0508dd10d0e8a4ab9a4cf94e85c","rustfmt.toml":"1ca600239a27401c4a43f363cf3f38183a212affc1f31bff3ae93234bbaec228","src/ahocorasick.rs":"c699c07df70be45c666e128509ad571a7649d2073e4ae16ac1efd6793c9c6890","src/automaton.rs":"22258a3e118672413119f8f543a9b912cce954e63524575c0ebfdf9011f9c2dd","src/dfa.rs":"bfef1a94c5e7410584b1beb4e857b40d1ae2031b881cbc06fb1300409bbd555f","src/lib.rs":"2a92d5c5e930f2d306508802e8a929135e1f41c9f5f8deda8f7eb98947179dd2","src/macros.rs":"c6c52ae05b24433cffaca7b78b3645d797862c5d5feffddf9f54909095ed6e05","src/nfa/contiguous.rs":"aeb6ee5fd80eea04decbc4b46aa27d1ab270b78d416a644da25b7934f009ee66","src/nfa/mod.rs":"ee7b3109774d14bbad5239c16bb980dd6b8185ec136d94fbaf2f0dc27d5ffa15","src/nfa/noncontiguous.rs":"de94f02b04efd8744fb096759a8897c22012b0e0ca3ace161fd87c71befefe04","src/packed/api.rs":"160d3b10823316f7b0924e13c3afd222c8a7db5c0a00432401f311ef27d6a1b7","src/packed/ext.rs":"66be06fde8558429da23a290584d4b9fae665bf64c2578db4fe5f5f3ee864869","src/packed/mod.rs":"0020cd6f07ba5c8955923a9516d7f758864260eda53a6b6f629131c45ddeec62","src/packed/pattern.rs":"1e3a289a730c141fc30b295811e372d046c6619c7fd670308299b889a06c7673","src/packed/rabinkarp.rs":"403146eb1d838a84601d171393542340513cd1ee7ff750f2372161dd47746586","src/packed/teddy/README.md":"3a43194b64e221543d885176aba3beb1224a927385a20eca842daf6b0ea2f342","src/packed/teddy/builder.rs":"720735ea6c7ff92b081426513e6e82feed24a922849297bb538d28f7b8129f81","src/packed/teddy/generic.rs":"ea252ab05b32cea7dd9d71e332071d243db7dd0362e049252a27e5881ba2bf39","src/packed/teddy/mod.rs":"17d741f7e2fb9dbac5ba7d1bd4542cf1e35e9f146ace728e23fe6bbed20028b2","src/packed/tests.rs":"8e2f56eb3890ed3876ecb47d3121996e416563127b6430110d7b516df3f83b4b","src/packed/vector.rs":"840065521cbd4701fa5b8b506d1537843d858c903f7cadf3c68749ea1780874b","src/tests.rs":"c68192ab97b6161d0d6ee96fefd80cc7d14e4486ddcd8d1f82b5c92432c24ed5","src/transducer.rs":"02daa33a5d6dac41dcfd67f51df7c0d4a91c5131c781fb54c4de3520c585a6e1","src/util/alphabet.rs":"6dc22658a38deddc0279892035b18870d4585069e35ba7c7e649a24509acfbcc","src/util/buffer.rs":"f9e37f662c46c6ecd734458dedbe76c3bb0e84a93b6b0117c0d4ad3042413891","src/util/byte_frequencies.rs":"2fb85b381c038c1e44ce94294531cdcd339dca48b1e61f41455666e802cbbc9e","src/util/debug.rs":"ab301ad59aa912529cb97233a54a05914dd3cb2ec43e6fec7334170b97ac5998","src/util/error.rs":"ecccd60e7406305023efcc6adcc826eeeb083ab8f7fbfe3d97469438cd4c4e5c","src/util/int.rs":"4ab6dbdba10027ddec2af63a9b28ce4eee30ded0daa5d8eb068b2b55542b6039","src/util/mod.rs":"7ab28d11323ecdbd982087f32eb8bceeee84f1a2583f3aae27039c36d58cf12c","src/util/prefilter.rs":"9fa4498f18bf70478b1996c1a013698b626d15f119aa81dbc536673c9f045718","src/util/primitives.rs":"f89f3fa1d8db4e37de9ca767c6d05e346404837cade6d063bba68972fafa610b","src/util/remapper.rs":"9f12d911583a325c11806eeceb46d0dfec863cfcfa241aed84d31af73da746e5","src/util/search.rs":"6af803e08b8b8c8a33db100623f1621b0d741616524ce40893d8316897f27ffe","src/util/special.rs":"7d2f9cb9dd9771f59816e829b2d96b1239996f32939ba98764e121696c52b146"},"package":"b2969dcb958b36655471fc61f7e416fa76033bdd4bfed0678d8fee1e2d07a1f0"}

3 vendor/aho-corasick/COPYING vendored Normal file
@@ -0,0 +1,3 @@
This project is dual-licensed under the Unlicense and MIT licenses.

You may use this code under the terms of either license.

74 vendor/aho-corasick/Cargo.toml vendored Normal file
@@ -0,0 +1,74 @@
# THIS FILE IS AUTOMATICALLY GENERATED BY CARGO
#
# When uploading crates to the registry Cargo will automatically
# "normalize" Cargo.toml files for maximal compatibility
# with all versions of Cargo and also rewrite `path` dependencies
# to registry (e.g., crates.io) dependencies.
#
# If you are reading this file be aware that the original Cargo.toml
# will likely look very different (and much more reasonable).
# See Cargo.toml.orig for the original contents.

[package]
edition = "2021"
rust-version = "1.60.0"
name = "aho-corasick"
version = "1.1.2"
authors = ["Andrew Gallant <jamslam@gmail.com>"]
exclude = [
    "/aho-corasick-debug",
    "/benchmarks",
    "/tmp",
]
autotests = false
description = "Fast multiple substring searching."
homepage = "https://github.com/BurntSushi/aho-corasick"
readme = "README.md"
keywords = [
    "string",
    "search",
    "text",
    "pattern",
    "multi",
]
categories = ["text-processing"]
license = "Unlicense OR MIT"
repository = "https://github.com/BurntSushi/aho-corasick"

[package.metadata.docs.rs]
all-features = true
rustdoc-args = [
    "--cfg",
    "docsrs",
    "--generate-link-to-definition",
]

[profile.bench]
debug = 2

[profile.release]
debug = 2

[lib]
name = "aho_corasick"

[dependencies.log]
version = "0.4.17"
optional = true

[dependencies.memchr]
version = "2.4.0"
optional = true
default-features = false

[dev-dependencies.doc-comment]
version = "0.3.3"

[features]
default = [
    "std",
    "perf-literal",
]
logging = ["dep:log"]
perf-literal = ["dep:memchr"]
std = ["memchr?/std"]

481 vendor/aho-corasick/DESIGN.md vendored Normal file
@@ -0,0 +1,481 @@
This document describes the internal design of this crate, which is an object
lesson in what happens when you take a fairly simple old algorithm like
Aho-Corasick and make it fast and production ready.

The target audience of this document is Rust programmers that have some
familiarity with string searching; however, one does not need to know the
Aho-Corasick algorithm in order to read this (it is explained below). One
should, however, know what a trie is. (If you don't, go read its Wikipedia
article.)

The center-piece of this crate is an implementation of Aho-Corasick. On its
own, Aho-Corasick isn't that complicated. The complex pieces come from the
different variants of Aho-Corasick implemented in this crate. Specifically,
they are:

* Aho-Corasick as a noncontiguous NFA. States have their transitions
represented sparsely, and each state puts its transitions in its own separate
allocation. Hence the name "noncontiguous."
* Aho-Corasick as a contiguous NFA. This NFA uses a single allocation to
represent the transitions of all states. That is, transitions are laid out
contiguously in memory. Moreover, states near the starting state are
represented densely, such that finding the next state ID takes a constant
number of instructions.
* Aho-Corasick as a DFA. In this case, all states are represented densely in
a transition table that uses one allocation.
* Supporting "standard" match semantics, along with its overlapping variant,
in addition to leftmost-first and leftmost-longest semantics. The "standard"
semantics are typically what you see in a textbook description of
Aho-Corasick. However, Aho-Corasick is also useful as an optimization in
regex engines, which often use leftmost-first or leftmost-longest semantics.
Thus, it is useful to implement those semantics here. The "standard" and
"leftmost" search algorithms are subtly different, and also require slightly
different construction algorithms.
* Support for ASCII case insensitive matching.
* Support for accelerating searches when the patterns all start with a small
number of fixed bytes. Or alternatively, when the patterns all contain a
small number of rare bytes. (Searching for these bytes uses SIMD vectorized
code courtesy of `memchr`.)
* Transparent support for alternative SIMD vectorized search routines for
smaller number of literals, such as the Teddy algorithm. We called these
"packed" search routines because they use SIMD. They can often be an order of
magnitude faster than just Aho-Corasick, but don't scale as well.
* Support for searching streams. This can reuse most of the underlying code,
but does require careful buffering support.
* Support for anchored searches, which permit efficient "is prefix" checks for
a large number of patterns.

When you combine all of this together along with trying to make everything as
fast as possible, what you end up with is entirely too much code with too much
`unsafe`. Alas, I was not smart enough to figure out how to reduce it. Instead,
we will explain it.


# Basics

The fundamental problem this crate is trying to solve is to determine the
occurrences of possibly many patterns in a haystack. The naive way to solve
this is to look for a match for each pattern at each position in the haystack:

    for i in 0..haystack.len():
        for p in patterns.iter():
            if haystack[i..].starts_with(p.bytes()):
                return Match(p.id(), i, i + p.bytes().len())

Those four lines are effectively all this crate does. The problem with those
four lines is that they are very slow, especially when you're searching for a
large number of patterns.

While there are many different algorithms available to solve this, a popular
one is Aho-Corasick. It's a common solution because it's not too hard to
implement, scales quite well even when searching for thousands of patterns and
is generally pretty fast. Aho-Corasick does well here because, regardless of
the number of patterns you're searching for, it always visits each byte in the
haystack exactly once. This means, generally speaking, adding more patterns to
an Aho-Corasick automaton does not make it slower. (Strictly speaking, however,
this is not true, since a larger automaton will make less effective use of the
CPU's cache.)

Aho-Corasick can be succinctly described as a trie with state transitions
between some of the nodes that efficiently instruct the search algorithm to
try matching alternative keys in the trie. The trick is that these state
transitions are arranged such that each byte of input needs to be inspected
only once. These state transitions are typically called "failure transitions,"
because they instruct the searcher (the thing traversing the automaton while
reading from the haystack) what to do when a byte in the haystack does not
correspond to a valid transition in the current state of the trie.

More formally, a failure transition points to a state in the automaton that may
lead to a match whose prefix is a proper suffix of the path traversed through
the trie so far. (If no such proper suffix exists, then the failure transition
points back to the start state of the trie, effectively restarting the search.)
This is perhaps simpler to explain pictorially. For example, let's say we built
an Aho-Corasick automaton with the following patterns: 'abcd' and 'cef'. The
trie looks like this:

         a - S1 - b - S2 - c - S3 - d - S4*
        /
    S0 - c - S5 - e - S6 - f - S7*

where states marked with a `*` are match states (meaning, the search algorithm
should stop and report a match to the caller).

So given this trie, it should be somewhat straight-forward to see how it can
be used to determine whether any particular haystack *starts* with either
`abcd` or `cef`. It's easy to express this in code:

    fn has_prefix(trie: &Trie, haystack: &[u8]) -> bool {
        let mut state_id = trie.start();
        // If the empty pattern is in trie, then state_id is a match state.
        if trie.is_match(state_id) {
            return true;
        }
        for (i, &b) in haystack.iter().enumerate() {
            state_id = match trie.next_state(state_id, b) {
                Some(id) => id,
                // If there was no transition for this state and byte, then we know
                // the haystack does not start with one of the patterns in our trie.
                None => return false,
            };
            if trie.is_match(state_id) {
                return true;
            }
        }
        false
    }

And that's pretty much it. All we do is move through the trie starting with the
bytes at the beginning of the haystack. If we find ourselves in a position
where we can't move, or if we've looked through the entire haystack without
seeing a match state, then we know the haystack does not start with any of the
patterns in the trie.

The meat of the Aho-Corasick algorithm is in how we add failure transitions to
our trie to keep searching efficient. Specifically, it permits us to not only
check whether a haystack *starts* with any one of a number of patterns, but
rather, whether the haystack contains any of a number of patterns *anywhere* in
the haystack.

As mentioned before, failure transitions connect a proper suffix of the path
traversed through the trie before, with a path that leads to a match that has a
prefix corresponding to that proper suffix. So in our case, for patterns `abcd`
and `cef`, with a haystack `abcef`, we want to transition to state `S5` (from
the diagram above) from `S3` upon seeing that the byte following `c` is not
`d`. Namely, the proper suffix in this example is `c`, which is a prefix of
`cef`. So the modified diagram looks like this:


         a - S1 - b - S2 - c - S3 - d - S4*
        /                      /
       /        ---------------
      /        /
    S0 - c - S5 - e - S6 - f - S7*

One thing that isn't shown in this diagram is that *all* states have a failure
transition, but only `S3` has a *non-trivial* failure transition. That is, all
other states have a failure transition back to the start state. So if our
haystack was `abzabcd`, then the searcher would transition back to `S0` after
seeing `z`, which effectively restarts the search. (Because there is no pattern
in our trie that has a prefix of `bz` or `z`.)
|
||||
The code for traversing this *automaton* or *finite state machine* (it is no
|
||||
longer just a trie) is not that much different from the `has_prefix` code
|
||||
above:
|
||||
|
||||
fn contains(fsm: &FiniteStateMachine, haystack: &[u8]) -> bool {
|
||||
let mut state_id = fsm.start();
|
||||
// If the empty pattern is in fsm, then state_id is a match state.
|
||||
if fsm.is_match(state_id) {
|
||||
return true;
|
||||
}
|
||||
for (i, &b) in haystack.iter().enumerate() {
|
||||
// While the diagram above doesn't show this, we may wind up needing
|
||||
// to follow multiple failure transitions before we land on a state
|
||||
// in which we can advance. Therefore, when searching for the next
|
||||
// state, we need to loop until we don't see a failure transition.
|
||||
//
|
||||
// This loop terminates because the start state has no empty
|
||||
// transitions. Every transition from the start state either points to
|
||||
// another state, or loops back to the start state.
|
||||
loop {
|
||||
match fsm.next_state(state_id, b) {
|
||||
Some(id) => {
|
||||
state_id = id;
|
||||
break;
|
||||
}
|
||||
// Unlike our code above, if there was no transition for this
|
||||
// state, then we don't quit. Instead, we look for this state's
|
||||
// failure transition and follow that instead.
|
||||
None => {
|
||||
state_id = fsm.next_fail_state(state_id);
|
||||
}
|
||||
};
|
||||
}
|
||||
if fsm.is_match(state_id) {
|
||||
return true;
|
||||
}
|
||||
}
|
||||
false
|
||||
}
|
||||
|
||||
Other than the complication around traversing failure transitions, this code
is still roughly "traverse the automaton with bytes from the haystack, and
quit when a match is seen."

And that concludes our section on the basics. While we didn't go deep into how
the automaton is built (see `src/nfa/noncontiguous.rs`, which has detailed
comments about that), the basic structure of Aho-Corasick should be reasonably
clear.


# NFAs and DFAs

There are generally two types of finite automata: non-deterministic finite
automata (NFA) and deterministic finite automata (DFA). The difference between
them is, principally, that an NFA can be in multiple states at once. This is
typically accomplished by things called _epsilon_ transitions, where one could
move to a new state without consuming any bytes from the input. (The other
mechanism by which NFAs can be in more than one state is where the same byte in
a particular state transitions to multiple distinct states.) In contrast, a DFA
can only ever be in one state at a time. A DFA has no epsilon transitions, and
for any given state, a byte transitions to at most one other state.

By this formulation, the Aho-Corasick automaton described in the previous
section is an NFA. This is because failure transitions are, effectively,
epsilon transitions. That is, whenever the automaton is in state `S`, it is
actually in the set of states that are reachable by recursively following
failure transitions from `S` until you reach the start state. (This means
that, for example, the start state is always active since the start state is
reachable via failure transitions from any state in the automaton.)

NFAs have a lot of nice properties. They tend to be easier to construct, and
also tend to use less memory. However, their primary downside is that they are
typically slower to execute a search with. For example, the code above showing
how to search with an Aho-Corasick automaton needs to potentially iterate
through many failure transitions for every byte of input. While this is a
fairly small amount of overhead, this can add up, especially if the automaton
has a lot of overlapping patterns with a lot of failure transitions.

A DFA's search code, by contrast, looks like this:

    fn contains(dfa: &DFA, haystack: &[u8]) -> bool {
        let mut state_id = dfa.start();
        // If the empty pattern is in dfa, then state_id is a match state.
        if dfa.is_match(state_id) {
            return true;
        }
        for &b in haystack.iter() {
            // An Aho-Corasick DFA *never* has a missing state that requires
            // failure transitions to be followed. One byte of input advances
            // the automaton by one state. Always.
            state_id = dfa.next_state(state_id, b);
            if dfa.is_match(state_id) {
                return true;
            }
        }
        false
    }

The search logic here is much simpler than for the NFA, and this tends to
translate into significant performance benefits as well, since there's a lot
less work being done for each byte in the haystack. How is this accomplished?
It's done by pre-following all failure transitions for all states for all bytes
in the alphabet, and then building a single state transition table. Building
this DFA can be much more costly than building the NFA, and it can use much
more memory, but the better search performance can be worth it.

Users of this crate can actually choose between using one of two possible NFAs
(noncontiguous or contiguous) or a DFA. By default, a contiguous NFA is used in
most circumstances, but a DFA will be used if the number of patterns is small
enough. A contiguous NFA is chosen because it uses orders of magnitude less
memory than a DFA, takes only a little longer to build than a noncontiguous
NFA and usually gets pretty close to the search speed of a DFA. (Callers can
override this automatic selection via the `AhoCorasickBuilder::kind`
configuration.)
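
For instance, here is a small sketch of forcing a particular automaton type
through the crate's public builder API (this assumes the `kind` option, which
takes an `Option<AhoCorasickKind>` so that `None` means "pick automatically"):

    use aho_corasick::{AhoCorasick, AhoCorasickKind};

    // Force a DFA instead of letting the crate pick an automaton type.
    let ac = AhoCorasick::builder()
        .kind(Some(AhoCorasickKind::DFA))
        .build(&["Sherlock", "Moriarty", "Watson"])
        .unwrap();
    // The crate reports which kind of automaton it actually built.
    assert_eq!(AhoCorasickKind::DFA, ac.kind());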


# More DFA tricks

As described in the previous section, one of the downsides of using a DFA
is that it uses more memory and can take longer to build. One small way of
mitigating these concerns is to map the alphabet used by the automaton into
a smaller space. Typically, the alphabet of a DFA has 256 elements in it:
one element for each possible value that fits into a byte. However, in many
cases, one does not need the full alphabet. For example, if all patterns in an
Aho-Corasick automaton are ASCII letters, then this only uses up 52 distinct
bytes. As far as the automaton is concerned, the remaining 204 bytes are
indistinguishable from one another: they will never discriminate between a
match or a non-match. Therefore, in cases like that, the alphabet can be shrunk
to just 53 elements. One for each ASCII letter, and then another to serve as a
placeholder for every other unused byte.

In practice, this library doesn't quite compute the optimal set of equivalence
classes, but it's close enough in most cases. The key idea is that this then
allows the transition table for the DFA to be potentially much smaller. The
downside of doing this, however, is that since the transition table is defined
in terms of this smaller alphabet space, every byte in the haystack must be
re-mapped to this smaller space. This requires an additional 256-byte table.
In practice, this can lead to a small search time hit, but it can be difficult
to measure. Moreover, it can sometimes lead to faster search times for bigger
automata, since it could be the difference between more parts of the automaton
staying in the CPU cache or not.

One other trick for DFAs employed by this crate is the notion of premultiplying
state identifiers. Specifically, the normal way to compute the next transition
in a DFA is via the following (assuming that the transition table is laid out
sequentially in memory, in row-major order, where the rows are states):

    next_state_id = dfa.transitions[current_state_id * 256 + current_byte]

However, since the value `256` is a fixed constant, we can actually premultiply
the state identifiers in the table when we build the table initially. Then, the
next transition computation simply becomes:

    next_state_id = dfa.transitions[current_state_id + current_byte]

This doesn't seem like much, but when this is being executed for every byte of
input that you're searching, saving that extra multiplication instruction can
add up.

The same optimization works even when equivalence classes are enabled, as
described above. The only difference is that the premultiplication is by the
total number of equivalence classes instead of 256.
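
To make both tricks concrete, here is a small sketch of a transition lookup
that combines equivalence classes with premultiplied state identifiers. This
is an illustration, not this crate's actual internals; the `Dfa` struct and
its fields are assumed for the example:

    struct Dfa {
        // Maps every possible haystack byte to its equivalence class.
        byte_class: [u8; 256],
        // Row-major transition table. State IDs stored here are already
        // premultiplied by the number of equivalence classes.
        transitions: Vec<u32>,
    }

    fn next_state(dfa: &Dfa, premultiplied_sid: u32, haystack_byte: u8) -> u32 {
        // One table lookup to shrink the alphabet...
        let class = u32::from(dfa.byte_class[usize::from(haystack_byte)]);
        // ...and one addition (no multiply) to find the next state.
        dfa.transitions[(premultiplied_sid + class) as usize]
    }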

There isn't much downside to premultiplying state identifiers, other than that
it imposes a smaller limit on the total number of states in the DFA. Namely,
with premultiplied state identifiers, you run out of room in your state
identifier representation more rapidly than if the identifiers are just state
indices.

Both equivalence classes and premultiplication are always enabled. There is an
`AhoCorasickBuilder::byte_classes` configuration, but disabling this just makes
it so there are always 256 equivalence classes, i.e., every class corresponds
to precisely one byte. When it's disabled, the equivalence class map itself is
still used. Disabling it is only useful when debugging the underlying
automaton, which can be easier to comprehend when transitions use actual byte
values instead of equivalence classes.

# Match semantics

One of the more interesting things about this implementation of Aho-Corasick
that (as far as this author knows) separates it from other implementations is
that it natively supports leftmost-first and leftmost-longest match semantics.
Briefly, match semantics refer to the decision procedure by which searching
will disambiguate matches when there are multiple to choose from:

* **standard** match semantics emits matches as soon as they are detected by
the automaton. This is typically equivalent to the textbook non-overlapping
formulation of Aho-Corasick.
* **leftmost-first** match semantics means that 1) the next match is the match
starting at the leftmost position and 2) among multiple matches starting at
the same leftmost position, the match corresponding to the pattern provided
first by the caller is reported.
* **leftmost-longest** is like leftmost-first, except when there are multiple
matches starting at the same leftmost position, the pattern corresponding to
the longest match is returned.

(The crate API documentation discusses these differences, with examples, in
more depth on the `MatchKind` type.)
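
To see the three semantics side by side, here is a short sketch using the
crate's public API (the patterns, haystack and `first_match` helper are chosen
purely for illustration):

    use aho_corasick::{AhoCorasick, MatchKind};

    fn first_match(kind: MatchKind, patterns: &[&str], haystack: &str) -> String {
        let ac = AhoCorasick::builder()
            .match_kind(kind)
            .build(patterns)
            .unwrap();
        let m = ac.find(haystack).expect("should have a match");
        haystack[m.start()..m.end()].to_string()
    }

    let patterns = &["b", "abc", "abcd"];
    // Standard semantics report the first match detected: `b` ends earliest.
    assert_eq!("b", first_match(MatchKind::Standard, patterns, "abcd"));
    // Leftmost-first prefers matches starting earliest, breaking ties by
    // pattern order: `abc` is listed before `abcd`.
    assert_eq!("abc", first_match(MatchKind::LeftmostFirst, patterns, "abcd"));
    // Leftmost-longest prefers the longest match at the leftmost position.
    assert_eq!("abcd", first_match(MatchKind::LeftmostLongest, patterns, "abcd"));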

The reason why supporting these match semantics is important is because it
gives the user more control over the match procedure. For example,
leftmost-first permits users to implement match priority by simply putting the
higher priority patterns first. Leftmost-longest, on the other hand, permits
finding the longest possible match, which might be useful when trying to find
words matching a dictionary. Additionally, regex engines often want to use
Aho-Corasick as an optimization when searching for an alternation of literals.
In order to preserve correct match semantics, regex engines typically can't use
the standard textbook definition directly, since regex engines will implement
either leftmost-first (Perl-like) or leftmost-longest (POSIX) match semantics.

Supporting leftmost semantics requires a couple of key changes:

* Constructing the Aho-Corasick automaton changes a bit in both how the trie is
constructed and how failure transitions are found. Namely, only a subset
of the failure transitions are added. Specifically, only the failure
transitions that either do not occur after a match, or do occur after a match
but preserve that match, are kept. (More details on this can be found in
`src/nfa/noncontiguous.rs`.)
* The search algorithm changes slightly. Since we are looking for the leftmost
match, we cannot quit as soon as a match is detected. Instead, after a match
is detected, we must keep searching until either the end of the input or
until a dead state is seen. (Dead states are not used for standard match
semantics. Dead states mean that searching should stop after a match has been
found.)

Most other implementations of Aho-Corasick do support leftmost match semantics,
but they do it with more overhead at search time, or even worse, with a queue
of matches and sophisticated hijinks to disambiguate the matches. While our
construction algorithm becomes a bit more complicated, the correct match
semantics fall out from the structure of the automaton itself.


# Overlapping matches

One of the nice properties of an Aho-Corasick automaton is that it can report
all possible matches, even when they overlap with one another. In this mode,
the match semantics don't matter, since all possible matches are reported.
Overlapping searches work just like regular searches, except the state
identifier at which the previous search left off is carried over to the next
search, so that it can pick up where it left off. If there are additional
matches at that state, then they are reported before resuming the search.
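
For example, here is a small sketch using the crate's public overlapping
search API (the patterns are chosen purely for illustration):

    use aho_corasick::AhoCorasick;

    let ac = AhoCorasick::new(&["abc", "b", "bc"]).unwrap();
    // With the default standard semantics, every overlapping occurrence is
    // reported: `b` at 1..2, `bc` at 1..3 and `abc` at 0..3.
    assert_eq!(3, ac.find_overlapping_iter("abc").count());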

Enabling leftmost-first or leftmost-longest match semantics causes the
automaton to use a subset of all failure transitions, which means that
overlapping searches cannot be used. Therefore, if leftmost match semantics are
used, attempting to do an overlapping search will return an error (or panic
when using the infallible APIs). Thus, to get overlapping searches, the caller
must use the default standard match semantics. This behavior was chosen because
there are only two alternatives, which were deemed worse:

* Compile two automatons internally, one for standard semantics and one for
the semantics requested by the caller (if not standard).
* Create a new type, distinct from the `AhoCorasick` type, which has different
capabilities based on the configuration options.

The first is untenable because of the amount of memory used by the automaton.
The second increases the complexity of the API too much by adding too many
types that do similar things. It is conceptually much simpler to keep all
searching isolated to a single type.

# Stream searching

Since Aho-Corasick is an automaton, it is possible to do partial searches on
individual pieces of the haystack, and then resume that search on subsequent
pieces. This is useful when the haystack you're trying to search is not stored
contiguously in memory, or if one does not want to read the entire haystack
into memory at once.

Currently, only standard semantics are supported for stream searching. This is
some of the more complicated code in this crate, and is something I would very
much like to improve. In particular, it currently has the restriction that it
must buffer at least enough of the haystack in memory in order to fit the
longest possible match. The difficulty in getting stream searching right is
that the implementation choices (such as the buffer size) often impact what the
API looks like and what it's allowed to do.
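
From the caller's perspective, the stream API looks like an ordinary search
over anything implementing `std::io::Read`. Here is a small sketch (a byte
slice stands in for a file or socket):

    use aho_corasick::AhoCorasick;

    fn main() -> Result<(), Box<dyn std::error::Error>> {
        let ac = AhoCorasick::new(&["Sherlock", "Watson"])?;
        // Any `std::io::Read` implementation works here.
        let rdr: &[u8] = b"Sherlock consulted Watson.";
        for result in ac.stream_find_iter(rdr) {
            // Stream searches can fail with an I/O error, so each match is
            // wrapped in a `std::io::Result`.
            let mat = result?;
            println!("pattern {} at {}..{}",
                     mat.pattern().as_usize(), mat.start(), mat.end());
        }
        Ok(())
    }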


# Prefilters

In some cases, Aho-Corasick is not the fastest way to find matches of multiple
patterns. Sometimes, the search can be accelerated using highly optimized SIMD
routines. For example, consider searching the following patterns:

    Sherlock
    Moriarty
    Watson

It is plausible that it would be much faster to quickly look for occurrences of
the leading bytes, `S`, `M` or `W`, before trying to start searching via the
automaton. Indeed, this is exactly what this crate will do.
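
The idea, sketched below with the `memchr` crate's `memchr3` routine, is to
skip ahead to the next position that could possibly begin a match and only
then run the automaton. (This is an illustration of the technique, not this
crate's exact internals.)

    // Find the next candidate position at or after `at` where a match
    // could conceivably start, by scanning for any of the leading bytes.
    fn next_candidate(haystack: &[u8], at: usize) -> Option<usize> {
        memchr::memchr3(b'S', b'M', b'W', &haystack[at..]).map(|i| at + i)
    }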

When there are more than three distinct starting bytes, then this crate will
look for three distinct bytes occurring at any position in the patterns, while
preferring bytes that are heuristically determined to be rare over others. For
example:

    Abuzz
    Sanchez
    Vasquez
    Topaz
    Waltz

Here, we have more than 3 distinct starting bytes, but all of the patterns
contain `z`, which is typically a rare byte. In this case, the prefilter will
scan for `z`, back up a bit, and then execute the Aho-Corasick automaton.

If all of that fails, then a packed multiple substring algorithm will be
attempted. Currently, the only algorithm available for this is Teddy, but more
may be added in the future. Teddy is unlike the above prefilters in that it
confirms its own matches, so when Teddy is active, it might not be necessary
for Aho-Corasick to run at all. However, the current Teddy implementation
only works on `x86_64` when SSSE3 or AVX2 are available, or on `aarch64`
(using NEON), and moreover, only works _well_ when there are a small number
of patterns (say, fewer than 100). Teddy also requires the haystack to be of a
certain length (more than 16-34 bytes). When the haystack is shorter than that,
Rabin-Karp is used instead. (See `src/packed/rabinkarp.rs`.)

There is a more thorough description of Teddy at
[`src/packed/teddy/README.md`](src/packed/teddy/README.md).
21
vendor/aho-corasick/LICENSE-MIT
vendored
Normal file
@ -0,0 +1,21 @@
The MIT License (MIT)

Copyright (c) 2015 Andrew Gallant

Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:

The above copyright notice and this permission notice shall be included in
all copies or substantial portions of the Software.

THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
THE SOFTWARE.
174
vendor/aho-corasick/README.md
vendored
Normal file
@ -0,0 +1,174 @@
aho-corasick
============
A library for finding occurrences of many patterns at once with SIMD
acceleration in some cases. This library provides multiple pattern
search principally through an implementation of the
[Aho-Corasick algorithm](https://en.wikipedia.org/wiki/Aho%E2%80%93Corasick_algorithm),
which builds a finite state machine for executing searches in linear time.
Features include case insensitive matching, overlapping matches, fast searching
via SIMD and optional full DFA construction and search & replace in streams.

[Build status](https://github.com/BurntSushi/aho-corasick/actions)
[crates.io](https://crates.io/crates/aho-corasick)

Dual-licensed under MIT or the [UNLICENSE](https://unlicense.org/).


### Documentation

https://docs.rs/aho-corasick


### Usage

Run `cargo add aho-corasick` to automatically add this crate as a dependency
in your `Cargo.toml` file.


### Example: basic searching

This example shows how to search for occurrences of multiple patterns
simultaneously. Each match includes the pattern that matched along with the
byte offsets of the match.

```rust
use aho_corasick::{AhoCorasick, PatternID};

let patterns = &["apple", "maple", "Snapple"];
let haystack = "Nobody likes maple in their apple flavored Snapple.";

let ac = AhoCorasick::new(patterns).unwrap();
let mut matches = vec![];
for mat in ac.find_iter(haystack) {
    matches.push((mat.pattern(), mat.start(), mat.end()));
}
assert_eq!(matches, vec![
    (PatternID::must(1), 13, 18),
    (PatternID::must(0), 28, 33),
    (PatternID::must(2), 43, 50),
]);
```


### Example: ASCII case insensitivity

This is like the previous example, but matches `Snapple` case insensitively
using `AhoCorasickBuilder`:

```rust
use aho_corasick::{AhoCorasick, PatternID};

let patterns = &["apple", "maple", "snapple"];
let haystack = "Nobody likes maple in their apple flavored Snapple.";

let ac = AhoCorasick::builder()
    .ascii_case_insensitive(true)
    .build(patterns)
    .unwrap();
let mut matches = vec![];
for mat in ac.find_iter(haystack) {
    matches.push((mat.pattern(), mat.start(), mat.end()));
}
assert_eq!(matches, vec![
    (PatternID::must(1), 13, 18),
    (PatternID::must(0), 28, 33),
    (PatternID::must(2), 43, 50),
]);
```


### Example: replacing matches in a stream

This example shows how to execute a search and replace on a stream without
loading the entire stream into memory first.

```rust,ignore
use aho_corasick::AhoCorasick;

let patterns = &["fox", "brown", "quick"];
let replace_with = &["sloth", "grey", "slow"];

// In a real example, these might be `std::fs::File`s instead. All you need to
// do is supply a pair of `std::io::Read` and `std::io::Write` implementations.
let rdr = "The quick brown fox.";
let mut wtr = vec![];

let ac = AhoCorasick::new(patterns).unwrap();
ac.stream_replace_all(rdr.as_bytes(), &mut wtr, replace_with)
    .expect("stream_replace_all failed");
assert_eq!(b"The slow grey sloth.".to_vec(), wtr);
```


### Example: finding the leftmost first match

In the textbook description of Aho-Corasick, its formulation is typically
structured such that it reports all possible matches, even when they overlap
with one another. In many cases, overlapping matches may not be desired, such
as the case of finding all successive non-overlapping matches like you might
with a standard regular expression.

Unfortunately the "obvious" way to modify the Aho-Corasick algorithm to do
this doesn't always work in the expected way, since it will report matches as
soon as they are seen. For example, consider matching the regex `Samwise|Sam`
against the text `Samwise`. Most regex engines (that are Perl-like, or
non-POSIX) will report `Samwise` as a match, but the standard Aho-Corasick
algorithm modified for reporting non-overlapping matches will report `Sam`.

A novel contribution of this library is the ability to change the match
semantics of Aho-Corasick (without additional search time overhead) such that
`Samwise` is reported instead. For example, here's the standard approach:

```rust
use aho_corasick::AhoCorasick;

let patterns = &["Samwise", "Sam"];
let haystack = "Samwise";

let ac = AhoCorasick::new(patterns).unwrap();
let mat = ac.find(haystack).expect("should have a match");
assert_eq!("Sam", &haystack[mat.start()..mat.end()]);
```

And now here's the leftmost-first version, which matches how a Perl-like
regex will work:

```rust
use aho_corasick::{AhoCorasick, MatchKind};

let patterns = &["Samwise", "Sam"];
let haystack = "Samwise";

let ac = AhoCorasick::builder()
    .match_kind(MatchKind::LeftmostFirst)
    .build(patterns)
    .unwrap();
let mat = ac.find(haystack).expect("should have a match");
assert_eq!("Samwise", &haystack[mat.start()..mat.end()]);
```

In addition to leftmost-first semantics, this library also supports
leftmost-longest semantics, which match the POSIX behavior of a regular
expression alternation. See `MatchKind` in the docs for more details.


### Minimum Rust version policy

This crate's minimum supported `rustc` version is `1.60.0`.

The current policy is that the minimum Rust version required to use this crate
can be increased in minor version updates. For example, if `crate 1.0` requires
Rust 1.20.0, then `crate 1.0.z` for all values of `z` will also require Rust
1.20.0 or newer. However, `crate 1.y` for `y > 0` may require a newer minimum
version of Rust.

In general, this crate will be conservative with respect to the minimum
supported version of Rust.


### FFI bindings

* [G-Research/ahocorasick_rs](https://github.com/G-Research/ahocorasick_rs/)
  is a Python wrapper for this library.
* [tmikus/ahocorasick_rs](https://github.com/tmikus/ahocorasick_rs) is a Go
  wrapper for this library.
24
vendor/aho-corasick/UNLICENSE
vendored
Normal file
@ -0,0 +1,24 @@
This is free and unencumbered software released into the public domain.

Anyone is free to copy, modify, publish, use, compile, sell, or
distribute this software, either in source code form or as a compiled
binary, for any purpose, commercial or non-commercial, and by any
means.

In jurisdictions that recognize copyright laws, the author or authors
of this software dedicate any and all copyright interest in the
software to the public domain. We make this dedication for the benefit
of the public at large and to the detriment of our heirs and
successors. We intend this dedication to be an overt act of
relinquishment in perpetuity of all present and future rights to this
software under copyright law.

THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
IN NO EVENT SHALL THE AUTHORS BE LIABLE FOR ANY CLAIM, DAMAGES OR
OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
OTHER DEALINGS IN THE SOFTWARE.

For more information, please refer to <http://unlicense.org/>
2
vendor/aho-corasick/rustfmt.toml
vendored
Normal file
@ -0,0 +1,2 @@
max_width = 79
use_small_heuristics = "max"
2789
vendor/aho-corasick/src/ahocorasick.rs
vendored
Normal file
File diff suppressed because it is too large
1608
vendor/aho-corasick/src/automaton.rs
vendored
Normal file
File diff suppressed because it is too large
835
vendor/aho-corasick/src/dfa.rs
vendored
Normal file
@ -0,0 +1,835 @@
/*!
Provides direct access to a DFA implementation of Aho-Corasick.

This is a low-level API that generally only needs to be used in niche
circumstances. When possible, prefer using [`AhoCorasick`](crate::AhoCorasick)
instead of a DFA directly. Using a `DFA` directly is typically only necessary
when one needs access to the [`Automaton`] trait implementation.
*/

use alloc::{vec, vec::Vec};

use crate::{
    automaton::Automaton,
    nfa::noncontiguous,
    util::{
        alphabet::ByteClasses,
        error::{BuildError, MatchError},
        int::{Usize, U32},
        prefilter::Prefilter,
        primitives::{IteratorIndexExt, PatternID, SmallIndex, StateID},
        search::{Anchored, MatchKind, StartKind},
        special::Special,
    },
};

/// A DFA implementation of Aho-Corasick.
///
/// When possible, prefer using [`AhoCorasick`](crate::AhoCorasick) instead of
/// this type directly. Using a `DFA` directly is typically only necessary when
/// one needs access to the [`Automaton`] trait implementation.
///
/// This DFA can only be built by first constructing a [`noncontiguous::NFA`].
/// Both [`DFA::new`] and [`Builder::build`] do this for you automatically, but
/// [`Builder::build_from_noncontiguous`] permits doing it explicitly.
///
/// A DFA provides the best possible search performance (in this crate) via two
/// mechanisms:
///
/// * All states use a dense representation for their transitions.
/// * All failure transitions are pre-computed such that they are never
///   explicitly handled at search time.
///
/// These two facts combined mean that every state transition is performed
/// using a constant number of instructions. However, this comes at
/// great cost. The memory usage of a DFA can be quite exorbitant.
/// It is potentially multiple orders of magnitude greater than a
/// [`contiguous::NFA`](crate::nfa::contiguous::NFA) for example. In exchange,
/// a DFA will typically have better search speed than a `contiguous::NFA`, but
/// not by orders of magnitude.
///
/// Unless you have a small number of patterns or memory usage is not a concern
/// and search performance is critical, a DFA is usually not the best choice.
///
/// Moreover, unlike the NFAs in this crate, it is costly for a DFA to
/// support both anchored and unanchored search configurations. Namely,
/// since failure transitions are pre-computed, supporting both anchored
/// and unanchored searches requires a duplication of the transition table,
/// making the memory usage of such a DFA even bigger. (The NFAs in this crate
/// unconditionally support both anchored and unanchored searches because there
/// is essentially no added cost for doing so.) It is for this reason that
/// a DFA's support for anchored and unanchored searches can be configured
/// via [`Builder::start_kind`]. By default, a DFA only supports unanchored
/// searches.
///
/// # Example
///
/// This example shows how to build a `DFA` directly and use it to execute
/// [`Automaton::try_find`]:
///
/// ```
/// use aho_corasick::{
///     automaton::Automaton,
///     dfa::DFA,
///     Input, Match,
/// };
///
/// let patterns = &["b", "abc", "abcd"];
/// let haystack = "abcd";
///
/// let nfa = DFA::new(patterns).unwrap();
/// assert_eq!(
///     Some(Match::must(0, 1..2)),
///     nfa.try_find(&Input::new(haystack))?,
/// );
/// # Ok::<(), Box<dyn std::error::Error>>(())
/// ```
///
/// It is also possible to implement your own version of `try_find`. See the
/// [`Automaton`] documentation for an example.
#[derive(Clone)]
pub struct DFA {
    /// The DFA transition table. IDs in this table are pre-multiplied. So
    /// instead of the IDs being 0, 1, 2, 3, ..., they are 0*stride, 1*stride,
    /// 2*stride, 3*stride, ...
    trans: Vec<StateID>,
    /// The matches for every match state in this DFA. This is first indexed by
    /// state index (so that's `sid >> stride2`) and then by order in which the
    /// matches are meant to occur.
    matches: Vec<Vec<PatternID>>,
    /// The amount of heap memory used, in bytes, by the inner Vecs of
    /// 'matches'.
    matches_memory_usage: usize,
    /// The length of each pattern. This is used to compute the start offset
    /// of a match.
    pattern_lens: Vec<SmallIndex>,
    /// A prefilter for accelerating searches, if one exists.
    prefilter: Option<Prefilter>,
    /// The match semantics built into this DFA.
    match_kind: MatchKind,
    /// The total number of states in this DFA.
    state_len: usize,
    /// The alphabet size, or total number of equivalence classes, for this
    /// DFA. Note that the actual number of transitions in each state is
    /// stride=2^stride2, where stride is the smallest power of 2 greater than
    /// or equal to alphabet_len. We do things this way so that we can use
    /// bitshifting to go from a state ID to an index into 'matches'.
    alphabet_len: usize,
    /// The exponent with a base 2, such that stride=2^stride2. Given a state
    /// index 'i', its state identifier is 'i << stride2'. Given a state
    /// identifier 'sid', its state index is 'sid >> stride2'.
    stride2: usize,
    /// The equivalence classes for this DFA. All transitions are defined on
    /// equivalence classes and not on the 256 distinct byte values.
    byte_classes: ByteClasses,
    /// The length of the shortest pattern in this automaton.
    min_pattern_len: usize,
    /// The length of the longest pattern in this automaton.
    max_pattern_len: usize,
    /// The information required to deduce which states are "special" in this
    /// DFA.
    special: Special,
}

impl DFA {
    /// Create a new Aho-Corasick DFA using the default configuration.
    ///
    /// Use a [`Builder`] if you want to change the configuration.
    pub fn new<I, P>(patterns: I) -> Result<DFA, BuildError>
    where
        I: IntoIterator<Item = P>,
        P: AsRef<[u8]>,
    {
        DFA::builder().build(patterns)
    }

    /// A convenience method for returning a new Aho-Corasick DFA builder.
    ///
    /// This usually permits one to just import the `DFA` type.
    pub fn builder() -> Builder {
        Builder::new()
    }
}

impl DFA {
    /// A sentinel state ID indicating that a search should stop once it has
    /// entered this state. When a search stops, it returns a match if one has
    /// been found, otherwise no match. A DFA always has an actual dead state
    /// at this ID.
    ///
    /// N.B. DFAs, unlike NFAs, do not have any notion of a FAIL state.
    /// Namely, the whole point of a DFA is that the FAIL state is completely
    /// compiled away. That is, DFA construction involves pre-computing the
    /// failure transitions everywhere, such that failure transitions are no
    /// longer used at search time. This, combined with its uniformly dense
    /// representation, are the two most important factors in why it's faster
    /// than the NFAs in this crate.
    const DEAD: StateID = StateID::new_unchecked(0);

    /// Adds the given pattern IDs as matches to the given state and also
    /// records the added memory usage.
    fn set_matches(
        &mut self,
        sid: StateID,
        pids: impl Iterator<Item = PatternID>,
    ) {
        let index = (sid.as_usize() >> self.stride2).checked_sub(2).unwrap();
        let mut at_least_one = false;
        for pid in pids {
            self.matches[index].push(pid);
            self.matches_memory_usage += PatternID::SIZE;
            at_least_one = true;
        }
        assert!(at_least_one, "match state must have non-empty pids");
    }
}

// SAFETY: 'start_state' always returns a valid state ID, 'next_state' always
// returns a valid state ID given a valid state ID. We otherwise claim that
// all other methods are correct as well.
unsafe impl Automaton for DFA {
    #[inline(always)]
    fn start_state(&self, anchored: Anchored) -> Result<StateID, MatchError> {
        // Either of the start state IDs can be DEAD, in which case, support
        // for that type of search is not provided by this DFA. Which start
        // state IDs are inactive depends on the 'StartKind' configuration at
        // DFA construction time.
        match anchored {
            Anchored::No => {
                let start = self.special.start_unanchored_id;
                if start == DFA::DEAD {
                    Err(MatchError::invalid_input_unanchored())
                } else {
                    Ok(start)
                }
            }
            Anchored::Yes => {
                let start = self.special.start_anchored_id;
                if start == DFA::DEAD {
                    Err(MatchError::invalid_input_anchored())
                } else {
                    Ok(start)
                }
            }
        }
    }

    #[inline(always)]
    fn next_state(
        &self,
        _anchored: Anchored,
        sid: StateID,
        byte: u8,
    ) -> StateID {
        let class = self.byte_classes.get(byte);
        self.trans[(sid.as_u32() + u32::from(class)).as_usize()]
    }

    #[inline(always)]
    fn is_special(&self, sid: StateID) -> bool {
        sid <= self.special.max_special_id
    }

    #[inline(always)]
    fn is_dead(&self, sid: StateID) -> bool {
        sid == DFA::DEAD
    }

    #[inline(always)]
    fn is_match(&self, sid: StateID) -> bool {
        !self.is_dead(sid) && sid <= self.special.max_match_id
    }

    #[inline(always)]
    fn is_start(&self, sid: StateID) -> bool {
        sid == self.special.start_unanchored_id
            || sid == self.special.start_anchored_id
    }

    #[inline(always)]
    fn match_kind(&self) -> MatchKind {
        self.match_kind
    }

    #[inline(always)]
    fn patterns_len(&self) -> usize {
        self.pattern_lens.len()
    }

    #[inline(always)]
    fn pattern_len(&self, pid: PatternID) -> usize {
        self.pattern_lens[pid].as_usize()
    }

    #[inline(always)]
    fn min_pattern_len(&self) -> usize {
        self.min_pattern_len
    }

    #[inline(always)]
    fn max_pattern_len(&self) -> usize {
        self.max_pattern_len
    }

    #[inline(always)]
    fn match_len(&self, sid: StateID) -> usize {
        debug_assert!(self.is_match(sid));
        let offset = (sid.as_usize() >> self.stride2) - 2;
        self.matches[offset].len()
    }

    #[inline(always)]
    fn match_pattern(&self, sid: StateID, index: usize) -> PatternID {
        debug_assert!(self.is_match(sid));
        let offset = (sid.as_usize() >> self.stride2) - 2;
        self.matches[offset][index]
    }

    #[inline(always)]
    fn memory_usage(&self) -> usize {
        use core::mem::size_of;

        (self.trans.len() * size_of::<u32>())
            + (self.matches.len() * size_of::<Vec<PatternID>>())
            + self.matches_memory_usage
            + (self.pattern_lens.len() * size_of::<SmallIndex>())
            + self.prefilter.as_ref().map_or(0, |p| p.memory_usage())
    }

    #[inline(always)]
    fn prefilter(&self) -> Option<&Prefilter> {
        self.prefilter.as_ref()
    }
}

impl core::fmt::Debug for DFA {
    fn fmt(&self, f: &mut core::fmt::Formatter) -> core::fmt::Result {
        use crate::{
            automaton::{fmt_state_indicator, sparse_transitions},
            util::debug::DebugByte,
        };

        writeln!(f, "dfa::DFA(")?;
        for index in 0..self.state_len {
            let sid = StateID::new_unchecked(index << self.stride2);
            // While we do currently include the FAIL state in the transition
            // table (to simplify construction), it is never actually used. It
            // poses problems with the code below because it gets treated as
            // a match state incidentally when it is, of course, not. So we
            // special case it. The fail state is always the first state after
            // the dead state.
            //
            // If the construction is changed to remove the fail state (it
            // probably should be), then this special case should be updated.
            if index == 1 {
                writeln!(f, "F {:06}:", sid.as_usize())?;
                continue;
            }
            fmt_state_indicator(f, self, sid)?;
            write!(f, "{:06}: ", sid.as_usize())?;

            let it = (0..self.byte_classes.alphabet_len()).map(|class| {
                (class.as_u8(), self.trans[sid.as_usize() + class])
            });
            for (i, (start, end, next)) in sparse_transitions(it).enumerate() {
                if i > 0 {
                    write!(f, ", ")?;
                }
                if start == end {
                    write!(
                        f,
                        "{:?} => {:?}",
                        DebugByte(start),
                        next.as_usize()
                    )?;
                } else {
                    write!(
                        f,
                        "{:?}-{:?} => {:?}",
                        DebugByte(start),
                        DebugByte(end),
                        next.as_usize()
                    )?;
                }
            }
            write!(f, "\n")?;
            if self.is_match(sid) {
                write!(f, "         matches: ")?;
                for i in 0..self.match_len(sid) {
                    if i > 0 {
                        write!(f, ", ")?;
                    }
                    let pid = self.match_pattern(sid, i);
                    write!(f, "{}", pid.as_usize())?;
                }
                write!(f, "\n")?;
            }
        }
        writeln!(f, "match kind: {:?}", self.match_kind)?;
        writeln!(f, "prefilter: {:?}", self.prefilter.is_some())?;
        writeln!(f, "state length: {:?}", self.state_len)?;
        writeln!(f, "pattern length: {:?}", self.patterns_len())?;
        writeln!(f, "shortest pattern length: {:?}", self.min_pattern_len)?;
        writeln!(f, "longest pattern length: {:?}", self.max_pattern_len)?;
        writeln!(f, "alphabet length: {:?}", self.alphabet_len)?;
        writeln!(f, "stride: {:?}", 1 << self.stride2)?;
        writeln!(f, "byte classes: {:?}", self.byte_classes)?;
        writeln!(f, "memory usage: {:?}", self.memory_usage())?;
        writeln!(f, ")")?;
        Ok(())
    }
}

/// A builder for configuring an Aho-Corasick DFA.
///
/// This builder has a subset of the options available to an
/// [`AhoCorasickBuilder`](crate::AhoCorasickBuilder). Of the shared options,
/// their behavior is identical.
#[derive(Clone, Debug)]
pub struct Builder {
    noncontiguous: noncontiguous::Builder,
    start_kind: StartKind,
    byte_classes: bool,
}

impl Default for Builder {
    fn default() -> Builder {
        Builder {
            noncontiguous: noncontiguous::Builder::new(),
            start_kind: StartKind::Unanchored,
            byte_classes: true,
        }
    }
}

impl Builder {
    /// Create a new builder for configuring an Aho-Corasick DFA.
    pub fn new() -> Builder {
        Builder::default()
    }

    /// Build an Aho-Corasick DFA from the given iterator of patterns.
    ///
    /// A builder may be reused to create more DFAs.
    pub fn build<I, P>(&self, patterns: I) -> Result<DFA, BuildError>
    where
        I: IntoIterator<Item = P>,
        P: AsRef<[u8]>,
    {
        let nnfa = self.noncontiguous.build(patterns)?;
        self.build_from_noncontiguous(&nnfa)
    }

    /// Build an Aho-Corasick DFA from the given noncontiguous NFA.
    ///
    /// Note that when this method is used, only the `start_kind` and
    /// `byte_classes` settings on this builder are respected. The other
    /// settings only apply to the initial construction of the Aho-Corasick
    /// automaton. Since using this method requires that initial construction
    /// has already completed, all settings impacting only initial construction
    /// are no longer relevant.
    pub fn build_from_noncontiguous(
        &self,
        nnfa: &noncontiguous::NFA,
    ) -> Result<DFA, BuildError> {
        debug!("building DFA");
        let byte_classes = if self.byte_classes {
            nnfa.byte_classes().clone()
        } else {
            ByteClasses::singletons()
        };
        let state_len = match self.start_kind {
            StartKind::Unanchored | StartKind::Anchored => nnfa.states().len(),
            StartKind::Both => {
                // These unwraps are OK because we know that the number of
                // NFA states is < StateID::LIMIT which is in turn less than
                // i32::MAX. Thus, there is always room to multiply by 2.
                // Finally, the number of states is always at least 4 in the
                // NFA (DEAD, FAIL, START-UNANCHORED, START-ANCHORED), so the
                // subtraction of 4 is okay.
                //
                // Note that we subtract 4 because the "anchored" part of
                // the DFA duplicates the unanchored part (without failure
                // transitions), but reuses the DEAD, FAIL and START states.
                nnfa.states()
                    .len()
                    .checked_mul(2)
                    .unwrap()
                    .checked_sub(4)
                    .unwrap()
            }
        };
        let trans_len =
            match state_len.checked_shl(byte_classes.stride2().as_u32()) {
                Some(trans_len) => trans_len,
                None => {
                    return Err(BuildError::state_id_overflow(
                        StateID::MAX.as_u64(),
                        usize::MAX.as_u64(),
                    ))
                }
            };
        StateID::new(trans_len.checked_sub(byte_classes.stride()).unwrap())
            .map_err(|e| {
                BuildError::state_id_overflow(
                    StateID::MAX.as_u64(),
                    e.attempted(),
                )
            })?;
        let num_match_states = match self.start_kind {
            StartKind::Unanchored | StartKind::Anchored => {
                nnfa.special().max_match_id.as_usize().checked_sub(1).unwrap()
            }
            StartKind::Both => nnfa
                .special()
                .max_match_id
                .as_usize()
                .checked_sub(1)
                .unwrap()
                .checked_mul(2)
                .unwrap(),
        };
        let mut dfa = DFA {
            trans: vec![DFA::DEAD; trans_len],
            matches: vec![vec![]; num_match_states],
            matches_memory_usage: 0,
            pattern_lens: nnfa.pattern_lens_raw().to_vec(),
            prefilter: nnfa.prefilter().map(|p| p.clone()),
            match_kind: nnfa.match_kind(),
            state_len,
            alphabet_len: byte_classes.alphabet_len(),
            stride2: byte_classes.stride2(),
            byte_classes,
            min_pattern_len: nnfa.min_pattern_len(),
            max_pattern_len: nnfa.max_pattern_len(),
            // The special state IDs are set later.
            special: Special::zero(),
        };
        match self.start_kind {
            StartKind::Both => {
                self.finish_build_both_starts(nnfa, &mut dfa);
            }
            StartKind::Unanchored => {
                self.finish_build_one_start(Anchored::No, nnfa, &mut dfa);
            }
            StartKind::Anchored => {
                self.finish_build_one_start(Anchored::Yes, nnfa, &mut dfa)
            }
        }
        debug!(
            "DFA built, <states: {:?}, size: {:?}, \
             alphabet len: {:?}, stride: {:?}>",
            dfa.state_len,
            dfa.memory_usage(),
            dfa.byte_classes.alphabet_len(),
            dfa.byte_classes.stride(),
        );
        // The vectors can grow ~twice as big during construction because a
        // Vec amortizes growth. But here, let's shrink things back down to
        // what we actually need since we're never going to add more to it.
        dfa.trans.shrink_to_fit();
        dfa.pattern_lens.shrink_to_fit();
        dfa.matches.shrink_to_fit();
        // TODO: We might also want to shrink each Vec inside of `dfa.matches`,
        // or even better, convert it to one contiguous allocation. But I think
        // I went with nested allocs for good reason (can't remember), so this
        // may be tricky to do. I decided not to shrink them here because it
        // might require a fair bit of work to do. It's unclear whether it's
        // worth it.
        Ok(dfa)
    }

    /// Finishes building a DFA for either unanchored or anchored searches,
    /// but NOT both.
    fn finish_build_one_start(
        &self,
        anchored: Anchored,
        nnfa: &noncontiguous::NFA,
        dfa: &mut DFA,
    ) {
        // This function always succeeds because we check above that all of the
        // states in the NFA can be mapped to DFA state IDs.
        let stride2 = dfa.stride2;
        let old2new = |oldsid: StateID| {
            StateID::new_unchecked(oldsid.as_usize() << stride2)
        };
        for (oldsid, state) in nnfa.states().iter().with_state_ids() {
            let newsid = old2new(oldsid);
            if state.is_match() {
                dfa.set_matches(newsid, nnfa.iter_matches(oldsid));
            }
            sparse_iter(
                nnfa,
                oldsid,
                &dfa.byte_classes,
                |byte, class, mut oldnextsid| {
                    if oldnextsid == noncontiguous::NFA::FAIL {
                        if anchored.is_anchored() {
                            oldnextsid = noncontiguous::NFA::DEAD;
                        } else if state.fail() == noncontiguous::NFA::DEAD {
                            // This is a special case that avoids following
                            // DEAD transitions in a non-contiguous NFA.
                            // Following these transitions is pretty slow
                            // because the non-contiguous NFA will always use
                            // a sparse representation for it (because the
                            // DEAD state is usually treated as a sentinel).
                            // The *vast* majority of failure states are DEAD
                            // states, so this winds up being pretty slow if
                            // we go through the non-contiguous NFA state
                            // transition logic. Instead, just do it ourselves.
                            oldnextsid = noncontiguous::NFA::DEAD;
                        } else {
                            oldnextsid = nnfa.next_state(
                                Anchored::No,
                                state.fail(),
                                byte,
                            );
                        }
                    }
                    dfa.trans[newsid.as_usize() + usize::from(class)] =
                        old2new(oldnextsid);
                },
            );
        }
        // Now that we've remapped all the IDs in our states, all that's left
        // is remapping the special state IDs.
        let old = nnfa.special();
        let new = &mut dfa.special;
        new.max_special_id = old2new(old.max_special_id);
        new.max_match_id = old2new(old.max_match_id);
        if anchored.is_anchored() {
            new.start_unanchored_id = DFA::DEAD;
            new.start_anchored_id = old2new(old.start_anchored_id);
        } else {
            new.start_unanchored_id = old2new(old.start_unanchored_id);
            new.start_anchored_id = DFA::DEAD;
        }
    }

    /// Finishes building a DFA that supports BOTH unanchored and anchored
    /// searches. It works by inter-leaving unanchored states with anchored
    /// states in the same transition table. This way, we avoid needing to
    /// re-shuffle states afterward to ensure that our states still look like
    /// DEAD, MATCH, ..., START-UNANCHORED, START-ANCHORED, NON-MATCH, ...
    ///
    /// Honestly this is pretty inscrutable... Simplifications are most
    /// welcome.
    fn finish_build_both_starts(
        &self,
        nnfa: &noncontiguous::NFA,
        dfa: &mut DFA,
    ) {
        let stride2 = dfa.stride2;
        let stride = 1 << stride2;
        let mut remap_unanchored = vec![DFA::DEAD; nnfa.states().len()];
        let mut remap_anchored = vec![DFA::DEAD; nnfa.states().len()];
        let mut is_anchored = vec![false; dfa.state_len];
        let mut newsid = DFA::DEAD;
        let next_dfa_id =
            |sid: StateID| StateID::new_unchecked(sid.as_usize() + stride);
        for (oldsid, state) in nnfa.states().iter().with_state_ids() {
            if oldsid == noncontiguous::NFA::DEAD
                || oldsid == noncontiguous::NFA::FAIL
            {
                remap_unanchored[oldsid] = newsid;
                remap_anchored[oldsid] = newsid;
                newsid = next_dfa_id(newsid);
            } else if oldsid == nnfa.special().start_unanchored_id
                || oldsid == nnfa.special().start_anchored_id
            {
                if oldsid == nnfa.special().start_unanchored_id {
                    remap_unanchored[oldsid] = newsid;
                    remap_anchored[oldsid] = DFA::DEAD;
                } else {
                    remap_unanchored[oldsid] = DFA::DEAD;
                    remap_anchored[oldsid] = newsid;
                    is_anchored[newsid.as_usize() >> stride2] = true;
                }
                if state.is_match() {
                    dfa.set_matches(newsid, nnfa.iter_matches(oldsid));
                }
                sparse_iter(
                    nnfa,
                    oldsid,
                    &dfa.byte_classes,
                    |_, class, oldnextsid| {
                        let class = usize::from(class);
                        if oldnextsid == noncontiguous::NFA::FAIL {
                            dfa.trans[newsid.as_usize() + class] = DFA::DEAD;
                        } else {
                            dfa.trans[newsid.as_usize() + class] = oldnextsid;
                        }
                    },
                );
                newsid = next_dfa_id(newsid);
            } else {
                let unewsid = newsid;
                newsid = next_dfa_id(newsid);
                let anewsid = newsid;
                newsid = next_dfa_id(newsid);

                remap_unanchored[oldsid] = unewsid;
                remap_anchored[oldsid] = anewsid;
                is_anchored[anewsid.as_usize() >> stride2] = true;
                if state.is_match() {
                    dfa.set_matches(unewsid, nnfa.iter_matches(oldsid));
                    dfa.set_matches(anewsid, nnfa.iter_matches(oldsid));
                }
                sparse_iter(
                    nnfa,
                    oldsid,
                    &dfa.byte_classes,
                    |byte, class, oldnextsid| {
                        let class = usize::from(class);
                        if oldnextsid == noncontiguous::NFA::FAIL {
                            let oldnextsid =
                                if state.fail() == noncontiguous::NFA::DEAD {
                                    noncontiguous::NFA::DEAD
                                } else {
                                    nnfa.next_state(
                                        Anchored::No,
                                        state.fail(),
                                        byte,
                                    )
                                };
                            dfa.trans[unewsid.as_usize() + class] = oldnextsid;
                        } else {
                            dfa.trans[unewsid.as_usize() + class] = oldnextsid;
                            dfa.trans[anewsid.as_usize() + class] = oldnextsid;
                        }
                    },
                );
            }
        }
        for i in 0..dfa.state_len {
            let sid = i << stride2;
            if is_anchored[i] {
                for next in dfa.trans[sid..][..stride].iter_mut() {
                    *next = remap_anchored[*next];
                }
            } else {
                for next in dfa.trans[sid..][..stride].iter_mut() {
                    *next = remap_unanchored[*next];
                }
            }
        }
        // Now that we've remapped all the IDs in our states, all that's left
        // is remapping the special state IDs.
        let old = nnfa.special();
        let new = &mut dfa.special;
        new.max_special_id = remap_anchored[old.max_special_id];
        new.max_match_id = remap_anchored[old.max_match_id];
        new.start_unanchored_id = remap_unanchored[old.start_unanchored_id];
        new.start_anchored_id = remap_anchored[old.start_anchored_id];
    }

    /// Set the desired match semantics.
    ///
    /// This only applies when using [`Builder::build`] and not
    /// [`Builder::build_from_noncontiguous`].
    ///
    /// See
    /// [`AhoCorasickBuilder::match_kind`](crate::AhoCorasickBuilder::match_kind)
    /// for more documentation and examples.
    pub fn match_kind(&mut self, kind: MatchKind) -> &mut Builder {
        self.noncontiguous.match_kind(kind);
        self
    }

    /// Enable ASCII-aware case insensitive matching.
    ///
    /// This only applies when using [`Builder::build`] and not
    /// [`Builder::build_from_noncontiguous`].
    ///
    /// See
    /// [`AhoCorasickBuilder::ascii_case_insensitive`](crate::AhoCorasickBuilder::ascii_case_insensitive)
    /// for more documentation and examples.
    pub fn ascii_case_insensitive(&mut self, yes: bool) -> &mut Builder {
        self.noncontiguous.ascii_case_insensitive(yes);
        self
    }

    /// Enable heuristic prefilter optimizations.
    ///
    /// This only applies when using [`Builder::build`] and not
    /// [`Builder::build_from_noncontiguous`].
    ///
    /// See
    /// [`AhoCorasickBuilder::prefilter`](crate::AhoCorasickBuilder::prefilter)
    /// for more documentation and examples.
    pub fn prefilter(&mut self, yes: bool) -> &mut Builder {
        self.noncontiguous.prefilter(yes);
        self
    }

    /// Sets the starting state configuration for the automaton.
    ///
    /// See
    /// [`AhoCorasickBuilder::start_kind`](crate::AhoCorasickBuilder::start_kind)
    /// for more documentation and examples.
    pub fn start_kind(&mut self, kind: StartKind) -> &mut Builder {
        self.start_kind = kind;
        self
    }

    /// A debug setting for whether to attempt to shrink the size of the
    /// automaton's alphabet or not.
    ///
    /// This should never be enabled unless you're debugging an automaton.
    /// Namely, disabling byte classes makes transitions easier to reason
    /// about, since they use the actual bytes instead of equivalence classes.
    /// Disabling this confers no performance benefit at search time.
    ///
    /// See
    /// [`AhoCorasickBuilder::byte_classes`](crate::AhoCorasickBuilder::byte_classes)
    /// for more documentation and examples.
    pub fn byte_classes(&mut self, yes: bool) -> &mut Builder {
        self.byte_classes = yes;
        self
    }
}

/// Iterate over all possible equivalence class transitions in this state.
/// The closure is called for all transitions with a distinct equivalence
/// class, even those not explicitly represented in this sparse state. For
/// any implicitly defined transitions, the given closure is called with
/// the fail state ID.
///
/// The closure is guaranteed to be called precisely
/// `byte_classes.alphabet_len()` times, once for every possible class in
/// ascending order.
fn sparse_iter<F: FnMut(u8, u8, StateID)>(
    nnfa: &noncontiguous::NFA,
    oldsid: StateID,
    classes: &ByteClasses,
    mut f: F,
) {
    let mut prev_class = None;
    let mut byte = 0usize;
    for t in nnfa.iter_trans(oldsid) {
        while byte < usize::from(t.byte()) {
            let rep = byte.as_u8();
            let class = classes.get(rep);
            byte += 1;
            if prev_class != Some(class) {
                f(rep, class, noncontiguous::NFA::FAIL);
                prev_class = Some(class);
            }
        }
        let rep = t.byte();
        let class = classes.get(rep);
        byte += 1;
        if prev_class != Some(class) {
            f(rep, class, t.next());
            prev_class = Some(class);
        }
    }
    for b in byte..=255 {
        let rep = b.as_u8();
        let class = classes.get(rep);
        if prev_class != Some(class) {
            f(rep, class, noncontiguous::NFA::FAIL);
            prev_class = Some(class);
        }
    }
}
326
vendor/aho-corasick/src/lib.rs
vendored
Normal file
@ -0,0 +1,326 @@
/*!
A library for finding occurrences of many patterns at once. This library
provides multiple pattern search principally through an implementation of the
[Aho-Corasick algorithm](https://en.wikipedia.org/wiki/Aho%E2%80%93Corasick_algorithm),
which builds a fast finite state machine for executing searches in linear time.

Additionally, this library provides a number of configuration options for
building the automaton that permit controlling the space versus time trade
off. Other features include simple ASCII case insensitive matching, finding
overlapping matches, replacements, searching streams and even searching and
replacing text in streams.

Finally, unlike most other Aho-Corasick implementations, this one
supports enabling [leftmost-first](MatchKind::LeftmostFirst) or
[leftmost-longest](MatchKind::LeftmostLongest) match semantics, using a
(seemingly) novel alternative construction algorithm. For more details on what
match semantics means, see the [`MatchKind`] type.

# Overview

This section gives a brief overview of the primary types in this crate:

* [`AhoCorasick`] is the primary type and represents an Aho-Corasick automaton.
This is the type you use to execute searches.
* [`AhoCorasickBuilder`] can be used to build an Aho-Corasick automaton, and
supports configuring a number of options.
* [`Match`] represents a single match reported by an Aho-Corasick automaton.
Each match has two pieces of information: the pattern that matched and the
start and end byte offsets corresponding to the position in the haystack at
which it matched.

# Example: basic searching

This example shows how to search for occurrences of multiple patterns
simultaneously. Each match includes the pattern that matched along with the
byte offsets of the match.

```
use aho_corasick::{AhoCorasick, PatternID};

let patterns = &["apple", "maple", "Snapple"];
let haystack = "Nobody likes maple in their apple flavored Snapple.";

let ac = AhoCorasick::new(patterns).unwrap();
let mut matches = vec![];
for mat in ac.find_iter(haystack) {
    matches.push((mat.pattern(), mat.start(), mat.end()));
}
assert_eq!(matches, vec![
    (PatternID::must(1), 13, 18),
    (PatternID::must(0), 28, 33),
    (PatternID::must(2), 43, 50),
]);
```

# Example: case insensitivity

This is like the previous example, but matches `Snapple` case insensitively
using `AhoCorasickBuilder`:

```
use aho_corasick::{AhoCorasick, PatternID};

let patterns = &["apple", "maple", "snapple"];
let haystack = "Nobody likes maple in their apple flavored Snapple.";

let ac = AhoCorasick::builder()
    .ascii_case_insensitive(true)
    .build(patterns)
    .unwrap();
let mut matches = vec![];
for mat in ac.find_iter(haystack) {
    matches.push((mat.pattern(), mat.start(), mat.end()));
}
assert_eq!(matches, vec![
    (PatternID::must(1), 13, 18),
    (PatternID::must(0), 28, 33),
    (PatternID::must(2), 43, 50),
]);
```

# Example: replacing matches in a stream

This example shows how to execute a search and replace on a stream without
loading the entire stream into memory first.

```
# #[cfg(feature = "std")] {
use aho_corasick::AhoCorasick;

# fn example() -> Result<(), std::io::Error> {
let patterns = &["fox", "brown", "quick"];
let replace_with = &["sloth", "grey", "slow"];

// In a real example, these might be `std::fs::File`s instead. All you need to
// do is supply a pair of `std::io::Read` and `std::io::Write` implementations.
let rdr = "The quick brown fox.";
let mut wtr = vec![];

let ac = AhoCorasick::new(patterns).unwrap();
ac.try_stream_replace_all(rdr.as_bytes(), &mut wtr, replace_with)?;
assert_eq!(b"The slow grey sloth.".to_vec(), wtr);
# Ok(()) }; example().unwrap()
# }
```

# Example: finding the leftmost first match

In the textbook description of Aho-Corasick, its formulation is typically
structured such that it reports all possible matches, even when they overlap
with another. In many cases, overlapping matches may not be desired, such as
the case of finding all successive non-overlapping matches like you might with
a standard regular expression.

Unfortunately the "obvious" way to modify the Aho-Corasick algorithm to do
this doesn't always work in the expected way, since it will report matches as
soon as they are seen. For example, consider matching the regex `Samwise|Sam`
against the text `Samwise`. Most regex engines (that are Perl-like, or
non-POSIX) will report `Samwise` as a match, but the standard Aho-Corasick
algorithm modified for reporting non-overlapping matches will report `Sam`.

A novel contribution of this library is the ability to change the match
semantics of Aho-Corasick (without additional search time overhead) such that
`Samwise` is reported instead. For example, here's the standard approach:

```
use aho_corasick::AhoCorasick;

let patterns = &["Samwise", "Sam"];
let haystack = "Samwise";

let ac = AhoCorasick::new(patterns).unwrap();
let mat = ac.find(haystack).expect("should have a match");
assert_eq!("Sam", &haystack[mat.start()..mat.end()]);
```

And now here's the leftmost-first version, which matches how a Perl-like
regex will work:

```
use aho_corasick::{AhoCorasick, MatchKind};

let patterns = &["Samwise", "Sam"];
let haystack = "Samwise";

let ac = AhoCorasick::builder()
    .match_kind(MatchKind::LeftmostFirst)
    .build(patterns)
    .unwrap();
let mat = ac.find(haystack).expect("should have a match");
assert_eq!("Samwise", &haystack[mat.start()..mat.end()]);
```

In addition to leftmost-first semantics, this library also supports
leftmost-longest semantics, which match the POSIX behavior of a regular
expression alternation. See [`MatchKind`] for more details.

# Prefilters

While an Aho-Corasick automaton can perform admirably when compared to more
naive solutions, it is generally slower than more specialized algorithms that
are accelerated using vector instructions such as SIMD.

For that reason, this library will internally use a "prefilter" to attempt
to accelerate searches when possible. Currently, this library has several
different algorithms it might use depending on the patterns provided. Once the
number of patterns gets too big, prefilters are no longer used.

While a prefilter is generally good to have on by default since it works
well in the common case, it can lead to less predictable or even sub-optimal
performance in some cases. For that reason, prefilters can be explicitly
disabled via [`AhoCorasickBuilder::prefilter`].
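
A minimal sketch of disabling the prefilter (search results are identical
either way; only performance characteristics can differ):

```
use aho_corasick::AhoCorasick;

let ac = AhoCorasick::builder()
    .prefilter(false)
    .build(&["Samwise", "Sam"])
    .unwrap();
assert!(ac.is_match("Samwise"));
```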

# Lower level APIs

This crate also provides several sub-modules that collectively expose many of
the implementation details of the main [`AhoCorasick`] type. Most users of this
library can completely ignore the submodules and their contents, but if you
needed finer grained control, some parts of them may be useful to you. Here is
a brief overview of each and why you might want to use them:

* The [`packed`] sub-module contains a lower level API for using fast
vectorized routines for finding a small number of patterns in a haystack.
You might want to use this API when you want to completely side-step using
Aho-Corasick automata. Otherwise, the fast vectorized routines are used
automatically as prefilters for `AhoCorasick` searches whenever possible.
* The [`automaton`] sub-module provides a lower level finite state
machine interface that the various Aho-Corasick implementations in
this crate implement. This sub-module's main contribution is the
[`Automaton`](automaton::Automaton) trait, which permits manually walking the
state transitions of an Aho-Corasick automaton.
* The [`dfa`] and [`nfa`] sub-modules provide DFA and NFA implementations of
the aforementioned `Automaton` trait. The main reason one might want to use
these sub-modules is to get access to a type that implements the `Automaton`
trait. (The top-level `AhoCorasick` type does not implement the `Automaton`
trait.)

As mentioned above, if you aren't sure whether you need these sub-modules,
you should be able to safely ignore them and just focus on the [`AhoCorasick`]
type.
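
For instance, here is a small sketch of driving a [`dfa::DFA`] through the
`Automaton` trait directly (this assumes the `DFA::new` constructor and the
`Automaton::try_find` entry point; note that with the default standard match
semantics, the shorter pattern `Sam` is the one reported):

```
use aho_corasick::{automaton::Automaton, dfa::DFA, Input};

let dfa = DFA::new(&["Samwise", "Sam"]).unwrap();
let mat = dfa.try_find(&Input::new("Samwise")).unwrap().expect("a match");
assert_eq!((1, 0, 3), (mat.pattern().as_usize(), mat.start(), mat.end()));
```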

# Crate features

This crate exposes a few features for controlling dependency usage and whether
this crate can be used without the standard library.

* **std** -
Enables support for the standard library. This feature is enabled by
default. When disabled, only `core` and `alloc` are used. At an API
level, enabling `std` enables `std::error::Error` trait impls for the
various error types, and higher level stream search routines such as
[`AhoCorasick::try_stream_find_iter`]. But the `std` feature is also required
to enable vectorized prefilters. Prefilters can greatly accelerate searches,
but generally only apply when the number of patterns is small (less than
~100).
* **perf-literal** -
Enables support for literal prefilters that use vectorized routines from
external crates. This feature is enabled by default. If you're only using
Aho-Corasick for large numbers of patterns or otherwise can abide lower
throughput when searching with a small number of patterns, then it is
reasonable to disable this feature.
* **logging** -
Enables a dependency on the `log` crate and emits messages to aid in
diagnostics. This feature is disabled by default.
*/

#![no_std]
#![deny(missing_docs)]
#![deny(rustdoc::broken_intra_doc_links)]
#![cfg_attr(docsrs, feature(doc_auto_cfg))]

extern crate alloc;
#[cfg(any(test, feature = "std"))]
extern crate std;

#[cfg(doctest)]
doc_comment::doctest!("../README.md");

#[cfg(feature = "std")]
pub use crate::ahocorasick::StreamFindIter;
pub use crate::{
    ahocorasick::{
        AhoCorasick, AhoCorasickBuilder, AhoCorasickKind, FindIter,
        FindOverlappingIter,
    },
    util::{
        error::{BuildError, MatchError, MatchErrorKind},
        primitives::{PatternID, PatternIDError},
        search::{Anchored, Input, Match, MatchKind, Span, StartKind},
    },
};

#[macro_use]
mod macros;

mod ahocorasick;
pub mod automaton;
pub mod dfa;
pub mod nfa;
pub mod packed;
#[cfg(test)]
mod tests;
// I wrote out the module for implementing fst::Automaton only to later realize
// that this would make fst a public dependency and fst is not at 1.0 yet. I
// decided to just keep the code in tree, but build it only during tests.
//
// TODO: I think I've changed my mind again. I'm considering pushing it out
// into either a separate crate or into 'fst' directly as an optional feature.
// #[cfg(test)]
// #[allow(dead_code)]
// mod transducer;
pub(crate) mod util;

#[cfg(test)]
mod testoibits {
    use std::panic::{RefUnwindSafe, UnwindSafe};

    use super::*;

    fn assert_all<T: Send + Sync + UnwindSafe + RefUnwindSafe>() {}

    #[test]
    fn oibits_main() {
        assert_all::<AhoCorasick>();
        assert_all::<AhoCorasickBuilder>();
        assert_all::<AhoCorasickKind>();
        assert_all::<FindIter>();
        assert_all::<FindOverlappingIter>();

        assert_all::<BuildError>();
        assert_all::<MatchError>();
        assert_all::<MatchErrorKind>();

        assert_all::<Anchored>();
        assert_all::<Input>();
        assert_all::<Match>();
        assert_all::<MatchKind>();
        assert_all::<Span>();
        assert_all::<StartKind>();
    }

    #[test]
    fn oibits_automaton() {
        use crate::{automaton, dfa::DFA};

        assert_all::<automaton::FindIter<DFA>>();
        assert_all::<automaton::FindOverlappingIter<DFA>>();
        #[cfg(feature = "std")]
        assert_all::<automaton::StreamFindIter<DFA, std::io::Stdin>>();
        assert_all::<automaton::OverlappingState>();

        assert_all::<automaton::Prefilter>();
        assert_all::<automaton::Candidate>();
    }

    #[test]
    fn oibits_packed() {
        use crate::packed;

        assert_all::<packed::Config>();
        assert_all::<packed::Builder>();
        assert_all::<packed::Searcher>();
        assert_all::<packed::FindIter>();
        assert_all::<packed::MatchKind>();
    }
}
18
vendor/aho-corasick/src/macros.rs
vendored
Normal file
@ -0,0 +1,18 @@
#![allow(unused_macros)]

// When the "logging" feature is disabled, these macros expand to nothing, so
// logging statements compile away entirely.
macro_rules! log {
    ($($tt:tt)*) => {
        #[cfg(feature = "logging")]
        {
            $($tt)*
        }
    }
}

macro_rules! debug {
    ($($tt:tt)*) => { log!(log::debug!($($tt)*)) }
}

macro_rules! trace {
    ($($tt:tt)*) => { log!(log::trace!($($tt)*)) }
}
1141
vendor/aho-corasick/src/nfa/contiguous.rs
vendored
Normal file
File diff suppressed because it is too large
40
vendor/aho-corasick/src/nfa/mod.rs
vendored
Normal file
@ -0,0 +1,40 @@
/*!
Provides direct access to NFA implementations of Aho-Corasick.

The principal characteristic of an NFA in this crate is that it may
transition through multiple states per byte of haystack. In Aho-Corasick
parlance, NFAs follow failure transitions during a search. In contrast,
a [`DFA`](crate::dfa::DFA) pre-computes all failure transitions during
compilation at the expense of a much bigger memory footprint.

Currently, there are two NFA implementations provided: noncontiguous and
contiguous. The names reflect their internal representation, and consequently,
the trade offs associated with them:

* A [`noncontiguous::NFA`] uses a separate allocation for every NFA state to
represent its transitions in a sparse format. This is ideal for building an
NFA, since it cheaply permits different states to have a different number of
transitions. A noncontiguous NFA is where the main Aho-Corasick construction
algorithm is implemented. All other Aho-Corasick implementations are built by
first constructing a noncontiguous NFA.
* A [`contiguous::NFA`] uses a single allocation to represent all states,
while still encoding most states as sparse states but permitting states near
the starting state to have a dense representation. The dense representation
uses more memory, but permits computing transitions during a search more
quickly. By only making the most active states dense (the states near the
starting state), a contiguous NFA better balances memory usage with search
speed. The single contiguous allocation also uses less overhead per state and
enables compression tricks where most states only use 8 bytes of heap memory.

When given the choice between these two, you almost always want to pick a
contiguous NFA. It takes only a little longer to build, but both its memory
usage and search speed are typically much better than a noncontiguous NFA. A
noncontiguous NFA is useful when prioritizing build times, or when there are
so many patterns that a contiguous NFA could not be built. (Currently, because
of both memory and search speed improvements, a contiguous NFA has a smaller
internal limit on the total number of NFA states it can represent. But you
would likely need to have hundreds of thousands or even millions of patterns
before you hit this limit.)
|
||||
pub mod contiguous;
|
||||
pub mod noncontiguous;
|
1762
vendor/aho-corasick/src/nfa/noncontiguous.rs
vendored
Normal file
File diff suppressed because it is too large
687
vendor/aho-corasick/src/packed/api.rs
vendored
Normal file
@ -0,0 +1,687 @@
use alloc::sync::Arc;

use crate::{
    packed::{pattern::Patterns, rabinkarp::RabinKarp, teddy},
    util::search::{Match, Span},
};

/// This is a limit placed on the total number of patterns we're willing to try
/// and match at once. As more sophisticated algorithms are added, this number
/// may be increased.
const PATTERN_LIMIT: usize = 128;

/// A knob for controlling the match semantics of a packed multiple string
/// searcher.
///
/// This differs from the [`MatchKind`](crate::MatchKind) type in the top-level
/// crate module in that it doesn't support "standard" match semantics,
/// and instead only supports leftmost-first or leftmost-longest. Namely,
/// "standard" semantics cannot be easily supported by packed searchers.
///
/// For more information on the distinction between leftmost-first and
/// leftmost-longest, see the docs on the top-level `MatchKind` type.
///
/// Unlike the top-level `MatchKind` type, the default match semantics for this
/// type are leftmost-first.
#[derive(Clone, Copy, Debug, Eq, PartialEq)]
#[non_exhaustive]
pub enum MatchKind {
    /// Use leftmost-first match semantics, which reports leftmost matches.
    /// When there are multiple possible leftmost matches, the match
    /// corresponding to the pattern that appeared earlier when constructing
    /// the automaton is reported.
    ///
    /// This is the default.
    LeftmostFirst,
    /// Use leftmost-longest match semantics, which reports leftmost matches.
    /// When there are multiple possible leftmost matches, the longest match
    /// is chosen.
    LeftmostLongest,
}

impl Default for MatchKind {
    fn default() -> MatchKind {
        MatchKind::LeftmostFirst
    }
}

/// The configuration for a packed multiple pattern searcher.
///
/// The configuration is currently limited only to being able to select the
/// match semantics (leftmost-first or leftmost-longest) of a searcher. In the
/// future, more knobs may be made available.
///
/// A configuration produces a [`packed::Builder`](Builder), which in turn can
/// be used to construct a [`packed::Searcher`](Searcher) for searching.
///
/// # Example
///
/// This example shows how to use leftmost-longest semantics instead of the
/// default (leftmost-first).
///
/// ```
/// use aho_corasick::{packed::{Config, MatchKind}, PatternID};
///
/// # fn example() -> Option<()> {
/// let searcher = Config::new()
///     .match_kind(MatchKind::LeftmostLongest)
///     .builder()
///     .add("foo")
///     .add("foobar")
///     .build()?;
/// let matches: Vec<PatternID> = searcher
///     .find_iter("foobar")
///     .map(|mat| mat.pattern())
///     .collect();
/// assert_eq!(vec![PatternID::must(1)], matches);
/// # Some(()) }
/// # if cfg!(all(feature = "std", any(
/// #     target_arch = "x86_64", target_arch = "aarch64",
/// # ))) {
/// #     example().unwrap()
/// # } else {
/// #     assert!(example().is_none());
/// # }
/// ```
#[derive(Clone, Debug)]
pub struct Config {
    kind: MatchKind,
    force: Option<ForceAlgorithm>,
    only_teddy_fat: Option<bool>,
    only_teddy_256bit: Option<bool>,
    heuristic_pattern_limits: bool,
}

/// An internal option for forcing the use of a particular packed algorithm.
///
/// When an algorithm is forced, if a searcher could not be constructed for it,
/// then no searcher will be returned even if an alternative algorithm would
/// work.
#[derive(Clone, Debug)]
enum ForceAlgorithm {
    Teddy,
    RabinKarp,
}

impl Default for Config {
    fn default() -> Config {
        Config::new()
    }
}

impl Config {
    /// Create a new default configuration. A default configuration uses
    /// leftmost-first match semantics.
    pub fn new() -> Config {
        Config {
            kind: MatchKind::LeftmostFirst,
            force: None,
            only_teddy_fat: None,
            only_teddy_256bit: None,
            heuristic_pattern_limits: true,
        }
    }

    /// Create a packed builder from this configuration. The builder can be
    /// used to accumulate patterns and create a [`Searcher`] from them.
    pub fn builder(&self) -> Builder {
        Builder::from_config(self.clone())
    }

    /// Set the match semantics for this configuration.
    pub fn match_kind(&mut self, kind: MatchKind) -> &mut Config {
        self.kind = kind;
        self
    }

    /// An undocumented method for forcing the use of the Teddy algorithm.
    ///
    /// This is only exposed for more precise testing and benchmarks. Callers
    /// should not use it as it is not part of the API stability guarantees of
    /// this crate.
    #[doc(hidden)]
    pub fn only_teddy(&mut self, yes: bool) -> &mut Config {
        if yes {
            self.force = Some(ForceAlgorithm::Teddy);
        } else {
            self.force = None;
        }
        self
    }

    /// An undocumented method for forcing the use of the Fat Teddy algorithm.
    ///
    /// This is only exposed for more precise testing and benchmarks. Callers
    /// should not use it as it is not part of the API stability guarantees of
    /// this crate.
    #[doc(hidden)]
    pub fn only_teddy_fat(&mut self, yes: Option<bool>) -> &mut Config {
        self.only_teddy_fat = yes;
        self
    }

    /// An undocumented method for forcing the use of SSE (`Some(false)`) or
    /// AVX (`Some(true)`) algorithms.
    ///
    /// This is only exposed for more precise testing and benchmarks. Callers
    /// should not use it as it is not part of the API stability guarantees of
    /// this crate.
    #[doc(hidden)]
    pub fn only_teddy_256bit(&mut self, yes: Option<bool>) -> &mut Config {
        self.only_teddy_256bit = yes;
        self
    }

    /// An undocumented method for forcing the use of the Rabin-Karp algorithm.
    ///
    /// This is only exposed for more precise testing and benchmarks. Callers
    /// should not use it as it is not part of the API stability guarantees of
    /// this crate.
    #[doc(hidden)]
    pub fn only_rabin_karp(&mut self, yes: bool) -> &mut Config {
        if yes {
            self.force = Some(ForceAlgorithm::RabinKarp);
        } else {
            self.force = None;
        }
        self
    }

    /// Request that heuristic limitations on the number of patterns be
    /// employed. This is useful to disable for benchmarking where one wants
    /// to explore how Teddy performs on a large number of patterns even if
    /// the heuristics would otherwise refuse construction.
    ///
    /// This is enabled by default.
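    ///
    /// A minimal sketch of turning the limits off (only sensible for
    /// benchmarking or experimentation):
    ///
    /// ```
    /// use aho_corasick::packed::Config;
    ///
    /// let mut config = Config::new();
    /// config.heuristic_pattern_limits(false);
    /// let builder = config.builder();
    /// ```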
    pub fn heuristic_pattern_limits(&mut self, yes: bool) -> &mut Config {
        self.heuristic_pattern_limits = yes;
        self
    }
}

/// A builder for constructing a packed searcher from a collection of patterns.
///
/// # Example
///
/// This example shows how to use a builder to construct a searcher. By
/// default, leftmost-first match semantics are used.
///
/// ```
/// use aho_corasick::{packed::{Builder, MatchKind}, PatternID};
///
/// # fn example() -> Option<()> {
/// let searcher = Builder::new()
///     .add("foobar")
///     .add("foo")
///     .build()?;
/// let matches: Vec<PatternID> = searcher
///     .find_iter("foobar")
///     .map(|mat| mat.pattern())
///     .collect();
/// assert_eq!(vec![PatternID::ZERO], matches);
/// # Some(()) }
/// # if cfg!(all(feature = "std", any(
/// #     target_arch = "x86_64", target_arch = "aarch64",
/// # ))) {
/// #     example().unwrap()
/// # } else {
/// #     assert!(example().is_none());
/// # }
/// ```
#[derive(Clone, Debug)]
pub struct Builder {
    /// The configuration of this builder and subsequent matcher.
    config: Config,
    /// Set to true if the builder detects that a matcher cannot be built.
    inert: bool,
    /// The patterns provided by the caller.
    patterns: Patterns,
}

impl Builder {
    /// Create a new builder for constructing a multi-pattern searcher. This
    /// constructor uses the default configuration.
    pub fn new() -> Builder {
        Builder::from_config(Config::new())
    }

    fn from_config(config: Config) -> Builder {
        Builder { config, inert: false, patterns: Patterns::new() }
    }

    /// Build a searcher from the patterns added to this builder so far.
    pub fn build(&self) -> Option<Searcher> {
        if self.inert || self.patterns.is_empty() {
            return None;
        }
        let mut patterns = self.patterns.clone();
        patterns.set_match_kind(self.config.kind);
        let patterns = Arc::new(patterns);
        let rabinkarp = RabinKarp::new(&patterns);
        // Effectively, we only want to return a searcher if we can use Teddy,
        // since Teddy is our only fast packed searcher at the moment.
        // Rabin-Karp is only used when searching haystacks smaller than what
        // Teddy can support. Thus, the only way to get a Rabin-Karp searcher
        // is to force it using undocumented APIs (for tests/benchmarks).
        let (search_kind, minimum_len) = match self.config.force {
            None | Some(ForceAlgorithm::Teddy) => {
                debug!("trying to build Teddy packed matcher");
                let teddy = match self.build_teddy(Arc::clone(&patterns)) {
                    None => return None,
                    Some(teddy) => teddy,
                };
                let minimum_len = teddy.minimum_len();
                (SearchKind::Teddy(teddy), minimum_len)
            }
            Some(ForceAlgorithm::RabinKarp) => {
                debug!("using Rabin-Karp packed matcher");
                (SearchKind::RabinKarp, 0)
            }
        };
        Some(Searcher { patterns, rabinkarp, search_kind, minimum_len })
    }

    fn build_teddy(&self, patterns: Arc<Patterns>) -> Option<teddy::Searcher> {
        teddy::Builder::new()
            .only_256bit(self.config.only_teddy_256bit)
            .only_fat(self.config.only_teddy_fat)
            .heuristic_pattern_limits(self.config.heuristic_pattern_limits)
            .build(patterns)
    }

    /// Add the given pattern to this set to match.
    ///
    /// The order in which patterns are added is significant. Namely, when
    /// using leftmost-first match semantics, then when multiple patterns can
    /// match at a particular location, the pattern that was added first is
    /// used as the match.
    ///
    /// If the number of patterns added exceeds the amount supported by packed
    /// searchers, then the builder will stop accumulating patterns and render
    /// itself inert. At this point, constructing a searcher will always return
    /// `None`.
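    ///
    /// # Example
    ///
    /// A small sketch of the inert behavior. An empty pattern also renders
    /// the builder inert, which is what this shows, since it does not depend
    /// on CPU features:
    ///
    /// ```
    /// use aho_corasick::packed::Builder;
    ///
    /// let mut builder = Builder::new();
    /// builder.add("");
    /// assert!(builder.build().is_none());
    /// ```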
    pub fn add<P: AsRef<[u8]>>(&mut self, pattern: P) -> &mut Builder {
        if self.inert {
            return self;
        } else if self.patterns.len() >= PATTERN_LIMIT {
            self.inert = true;
            self.patterns.reset();
            return self;
        }
        // Just in case PATTERN_LIMIT increases beyond u16::MAX.
        assert!(self.patterns.len() <= core::u16::MAX as usize);

        let pattern = pattern.as_ref();
        if pattern.is_empty() {
            self.inert = true;
            self.patterns.reset();
            return self;
        }
        self.patterns.add(pattern);
        self
    }

    /// Add the given iterator of patterns to this set to match.
    ///
    /// The iterator must yield elements that can be converted into a `&[u8]`.
    ///
    /// The order in which patterns are added is significant. Namely, when
    /// using leftmost-first match semantics, then when multiple patterns can
    /// match at a particular location, the pattern that was added first is
    /// used as the match.
    ///
    /// If the number of patterns added exceeds the amount supported by packed
    /// searchers, then the builder will stop accumulating patterns and render
    /// itself inert. At this point, constructing a searcher will always return
    /// `None`.
    pub fn extend<I, P>(&mut self, patterns: I) -> &mut Builder
    where
        I: IntoIterator<Item = P>,
        P: AsRef<[u8]>,
    {
        for p in patterns {
            self.add(p);
        }
        self
    }

    /// Returns the number of patterns added to this builder.
    pub fn len(&self) -> usize {
        self.patterns.len()
    }

    /// Returns the length, in bytes, of the shortest pattern added.
    pub fn minimum_len(&self) -> usize {
        self.patterns.minimum_len()
    }
}

impl Default for Builder {
    fn default() -> Builder {
        Builder::new()
    }
}

/// A packed searcher for quickly finding occurrences of multiple patterns.
///
/// If callers need more flexible construction, or if one wants to change the
/// match semantics (either leftmost-first or leftmost-longest), then one can
/// use the [`Config`] and/or [`Builder`] types for more fine grained control.
///
/// # Example
///
/// This example shows how to create a searcher from an iterator of patterns.
/// By default, leftmost-first match semantics are used.
///
/// ```
/// use aho_corasick::{packed::{MatchKind, Searcher}, PatternID};
///
/// # fn example() -> Option<()> {
/// let searcher = Searcher::new(["foobar", "foo"].iter().cloned())?;
/// let matches: Vec<PatternID> = searcher
///     .find_iter("foobar")
///     .map(|mat| mat.pattern())
///     .collect();
/// assert_eq!(vec![PatternID::ZERO], matches);
/// # Some(()) }
/// # if cfg!(all(feature = "std", any(
/// #     target_arch = "x86_64", target_arch = "aarch64",
/// # ))) {
/// #     example().unwrap()
/// # } else {
/// #     assert!(example().is_none());
/// # }
/// ```
#[derive(Clone, Debug)]
pub struct Searcher {
    patterns: Arc<Patterns>,
    rabinkarp: RabinKarp,
    search_kind: SearchKind,
    minimum_len: usize,
}

#[derive(Clone, Debug)]
enum SearchKind {
    Teddy(teddy::Searcher),
    RabinKarp,
}

impl Searcher {
    /// A convenience function for constructing a searcher from an iterator
    /// of things that can be converted to a `&[u8]`.
    ///
    /// If a searcher could not be constructed (either because of an
    /// unsupported CPU or because there are too many patterns), then `None`
    /// is returned.
    ///
    /// # Example
    ///
    /// Basic usage:
    ///
    /// ```
    /// use aho_corasick::{packed::{MatchKind, Searcher}, PatternID};
    ///
    /// # fn example() -> Option<()> {
    /// let searcher = Searcher::new(["foobar", "foo"].iter().cloned())?;
    /// let matches: Vec<PatternID> = searcher
    ///     .find_iter("foobar")
    ///     .map(|mat| mat.pattern())
    ///     .collect();
    /// assert_eq!(vec![PatternID::ZERO], matches);
    /// # Some(()) }
    /// # if cfg!(all(feature = "std", any(
    /// #     target_arch = "x86_64", target_arch = "aarch64",
    /// # ))) {
    /// #     example().unwrap()
    /// # } else {
    /// #     assert!(example().is_none());
    /// # }
    /// ```
    pub fn new<I, P>(patterns: I) -> Option<Searcher>
    where
        I: IntoIterator<Item = P>,
        P: AsRef<[u8]>,
    {
        Builder::new().extend(patterns).build()
    }

    /// A convenience function for calling `Config::new()`.
    ///
    /// This is useful for avoiding an additional import.
    pub fn config() -> Config {
        Config::new()
    }

    /// A convenience function for calling `Builder::new()`.
    ///
    /// This is useful for avoiding an additional import.
    pub fn builder() -> Builder {
        Builder::new()
    }

    /// Return the first occurrence of any of the patterns in this searcher,
    /// according to its match semantics, in the given haystack. The `Match`
    /// returned will include the identifier of the pattern that matched, which
    /// corresponds to the index of the pattern (starting from `0`) at which it
    /// was added.
    ///
    /// # Example
    ///
    /// Basic usage:
    ///
    /// ```
    /// use aho_corasick::{packed::{MatchKind, Searcher}, PatternID};
    ///
    /// # fn example() -> Option<()> {
    /// let searcher = Searcher::new(["foobar", "foo"].iter().cloned())?;
    /// let mat = searcher.find("foobar")?;
    /// assert_eq!(PatternID::ZERO, mat.pattern());
    /// assert_eq!(0, mat.start());
    /// assert_eq!(6, mat.end());
    /// # Some(()) }
    /// # if cfg!(all(feature = "std", any(
    /// #     target_arch = "x86_64", target_arch = "aarch64",
    /// # ))) {
    /// #     example().unwrap()
    /// # } else {
    /// #     assert!(example().is_none());
    /// # }
    /// ```
    #[inline]
    pub fn find<B: AsRef<[u8]>>(&self, haystack: B) -> Option<Match> {
        let haystack = haystack.as_ref();
        self.find_in(haystack, Span::from(0..haystack.len()))
    }

    /// Return the first occurrence of any of the patterns in this searcher,
    /// according to its match semantics, in the given haystack starting from
    /// the given position.
    ///
    /// The `Match` returned will include the identifier of the pattern that
    /// matched, which corresponds to the index of the pattern (starting from
    /// `0`) at which it was added. The offsets in the `Match` will be relative
    /// to the start of `haystack` (and not `at`).
    ///
    /// # Example
    ///
    /// Basic usage:
    ///
    /// ```
    /// use aho_corasick::{packed::{MatchKind, Searcher}, PatternID, Span};
    ///
    /// # fn example() -> Option<()> {
    /// let haystack = "foofoobar";
    /// let searcher = Searcher::new(["foobar", "foo"].iter().cloned())?;
    /// let mat = searcher.find_in(haystack, Span::from(3..haystack.len()))?;
    /// assert_eq!(PatternID::ZERO, mat.pattern());
    /// assert_eq!(3, mat.start());
    /// assert_eq!(9, mat.end());
    /// # Some(()) }
    /// # if cfg!(all(feature = "std", any(
    /// #     target_arch = "x86_64", target_arch = "aarch64",
    /// # ))) {
    /// #     example().unwrap()
    /// # } else {
    /// #     assert!(example().is_none());
    /// # }
    /// ```
    #[inline]
    pub fn find_in<B: AsRef<[u8]>>(
        &self,
        haystack: B,
        span: Span,
    ) -> Option<Match> {
        let haystack = haystack.as_ref();
        match self.search_kind {
            SearchKind::Teddy(ref teddy) => {
                // Teddy needs a minimum amount of haystack to work with, so
                // fall back to the slower Rabin-Karp searcher for anything
                // shorter.
                if haystack[span].len() < teddy.minimum_len() {
                    return self.find_in_slow(haystack, span);
                }
                teddy.find(&haystack[..span.end], span.start)
            }
            SearchKind::RabinKarp => {
                self.rabinkarp.find_at(&haystack[..span.end], span.start)
            }
        }
    }

    /// Return an iterator of non-overlapping occurrences of the patterns in
    /// this searcher, according to its match semantics, in the given haystack.
    ///
    /// # Example
    ///
    /// Basic usage:
    ///
    /// ```
    /// use aho_corasick::{packed::{MatchKind, Searcher}, PatternID};
    ///
    /// # fn example() -> Option<()> {
    /// let searcher = Searcher::new(["foobar", "foo"].iter().cloned())?;
    /// let matches: Vec<PatternID> = searcher
    ///     .find_iter("foobar fooba foofoo")
    ///     .map(|mat| mat.pattern())
    ///     .collect();
    /// assert_eq!(vec![
    ///     PatternID::must(0),
    ///     PatternID::must(1),
    ///     PatternID::must(1),
    ///     PatternID::must(1),
    /// ], matches);
    /// # Some(()) }
    /// # if cfg!(all(feature = "std", any(
    /// #     target_arch = "x86_64", target_arch = "aarch64",
    /// # ))) {
    /// #     example().unwrap()
    /// # } else {
    /// #     assert!(example().is_none());
    /// # }
    /// ```
    #[inline]
    pub fn find_iter<'a, 'b, B: ?Sized + AsRef<[u8]>>(
        &'a self,
        haystack: &'b B,
    ) -> FindIter<'a, 'b> {
        let haystack = haystack.as_ref();
        let span = Span::from(0..haystack.len());
        FindIter { searcher: self, haystack, span }
    }

    /// Returns the match kind used by this packed searcher.
    ///
    /// # Examples
    ///
    /// Basic usage:
    ///
    /// ```
    /// use aho_corasick::packed::{MatchKind, Searcher};
    ///
    /// # fn example() -> Option<()> {
    /// let searcher = Searcher::new(["foobar", "foo"].iter().cloned())?;
    /// // leftmost-first is the default.
    /// assert_eq!(&MatchKind::LeftmostFirst, searcher.match_kind());
    /// # Some(()) }
    /// # if cfg!(all(feature = "std", any(
    /// #     target_arch = "x86_64", target_arch = "aarch64",
    /// # ))) {
    /// #     example().unwrap()
    /// # } else {
    /// #     assert!(example().is_none());
    /// # }
    /// ```
    #[inline]
    pub fn match_kind(&self) -> &MatchKind {
        self.patterns.match_kind()
    }

    /// Returns the minimum length of a haystack that is required in order for
    /// packed searching to be effective.
    ///
    /// In some cases, the underlying packed searcher may not be able to search
    /// very short haystacks. When that occurs, the implementation will defer
    /// to a slower non-packed searcher (which is still generally faster than
    /// Aho-Corasick for a small number of patterns). However, callers may
    /// want to avoid ever using the slower variant, which one can do by
    /// never passing a haystack shorter than the minimum length returned by
    /// this method.
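    ///
    /// For example, a caller might route short haystacks elsewhere entirely
    /// (a sketch; `search_short_haystack` is a hypothetical helper):
    ///
    /// ```ignore
    /// if haystack.len() < searcher.minimum_len() {
    ///     search_short_haystack(haystack)
    /// } else {
    ///     searcher.find(haystack)
    /// }
    /// ```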
    #[inline]
    pub fn minimum_len(&self) -> usize {
        self.minimum_len
    }

    /// Returns the approximate total amount of heap used by this searcher, in
    /// units of bytes.
    #[inline]
    pub fn memory_usage(&self) -> usize {
        self.patterns.memory_usage()
            + self.rabinkarp.memory_usage()
            + self.search_kind.memory_usage()
    }

    /// Use a slow (non-packed) searcher.
    ///
    /// This is useful when a packed searcher could be constructed, but could
    /// not be used to search a specific haystack. For example, if Teddy was
    /// built but the haystack is smaller than ~34 bytes, then Teddy might not
    /// be able to run.
    fn find_in_slow(&self, haystack: &[u8], span: Span) -> Option<Match> {
        self.rabinkarp.find_at(&haystack[..span.end], span.start)
    }
}

impl SearchKind {
    fn memory_usage(&self) -> usize {
        match *self {
            SearchKind::Teddy(ref ted) => ted.memory_usage(),
            SearchKind::RabinKarp => 0,
        }
    }
}

/// An iterator over non-overlapping matches from a packed searcher.
///
/// The lifetime `'s` refers to the lifetime of the underlying [`Searcher`],
/// while the lifetime `'h` refers to the lifetime of the haystack being
/// searched.
#[derive(Debug)]
pub struct FindIter<'s, 'h> {
    searcher: &'s Searcher,
    haystack: &'h [u8],
    span: Span,
}

impl<'s, 'h> Iterator for FindIter<'s, 'h> {
    type Item = Match;

    fn next(&mut self) -> Option<Match> {
        if self.span.start > self.span.end {
            return None;
        }
        match self.searcher.find_in(&self.haystack, self.span) {
            None => None,
            Some(m) => {
                // Advance the search window to just past this match, which
                // is what keeps the matches non-overlapping.
                self.span.start = m.end();
                Some(m)
            }
        }
    }
}
39
vendor/aho-corasick/src/packed/ext.rs
vendored
Normal file
@ -0,0 +1,39 @@
/// A trait for adding some helper routines to pointers.
pub(crate) trait Pointer {
    /// Returns the distance, in units of `T`, between `self` and `origin`.
    ///
    /// # Safety
    ///
    /// Same safety requirements as `ptr::offset_from`, in addition to
    /// requiring that `self >= origin`.
    unsafe fn distance(self, origin: Self) -> usize;

    /// Casts this pointer to `usize`.
    ///
    /// Callers should not convert the `usize` back to a pointer if at all
    /// possible. (And if you believe it's necessary, open an issue to discuss
    /// why. Otherwise, it has the potential to violate pointer provenance.)
    /// The purpose of this function is just to be able to do arithmetic, i.e.,
    /// computing offsets or alignments.
    fn as_usize(self) -> usize;
}

impl<T> Pointer for *const T {
    unsafe fn distance(self, origin: *const T) -> usize {
        // TODO: Replace with `ptr::sub_ptr` once stabilized.
        usize::try_from(self.offset_from(origin)).unwrap_unchecked()
    }

    fn as_usize(self) -> usize {
        self as usize
    }
}

impl<T> Pointer for *mut T {
    unsafe fn distance(self, origin: *mut T) -> usize {
        (self as *const T).distance(origin as *const T)
    }

    fn as_usize(self) -> usize {
        (self as *const T).as_usize()
    }
}
120
vendor/aho-corasick/src/packed/mod.rs
vendored
Normal file
@ -0,0 +1,120 @@
/*!
Provides packed multiple substring search, principally for a small number of
patterns.

This sub-module provides vectorized routines for quickly finding
matches of a small number of patterns. In general, users of this crate
shouldn't need to interface with this module directly, as the primary
[`AhoCorasick`](crate::AhoCorasick) searcher will use these routines
automatically as a prefilter when applicable. However, in some cases, callers
may want to bypass the Aho-Corasick machinery entirely and use this vectorized
searcher directly.

# Overview

The primary types in this sub-module are:

* [`Searcher`] executes the actual search algorithm to report matches in a
haystack.
* [`Builder`] accumulates patterns incrementally and can construct a
`Searcher`.
* [`Config`] permits tuning the searcher, and itself will produce a `Builder`
(which can then be used to build a `Searcher`). Currently, the only tuneable
knob is the match semantics, but this may be expanded in the future.

# Examples

This example shows how to create a searcher from an iterator of patterns.
By default, leftmost-first match semantics are used. (See the top-level
[`MatchKind`] type for more details about match semantics, which apply
similarly to packed substring search.)

```
use aho_corasick::{packed::{MatchKind, Searcher}, PatternID};

# fn example() -> Option<()> {
let searcher = Searcher::new(["foobar", "foo"].iter().cloned())?;
let matches: Vec<PatternID> = searcher
    .find_iter("foobar")
    .map(|mat| mat.pattern())
    .collect();
assert_eq!(vec![PatternID::ZERO], matches);
# Some(()) }
# if cfg!(all(feature = "std", any(
#     target_arch = "x86_64", target_arch = "aarch64",
# ))) {
#     example().unwrap()
# } else {
#     assert!(example().is_none());
# }
```

This example shows how to use [`Config`] to change the match semantics to
leftmost-longest:

```
use aho_corasick::{packed::{Config, MatchKind}, PatternID};

# fn example() -> Option<()> {
let searcher = Config::new()
    .match_kind(MatchKind::LeftmostLongest)
    .builder()
    .add("foo")
    .add("foobar")
    .build()?;
let matches: Vec<PatternID> = searcher
    .find_iter("foobar")
    .map(|mat| mat.pattern())
    .collect();
assert_eq!(vec![PatternID::must(1)], matches);
# Some(()) }
# if cfg!(all(feature = "std", any(
#     target_arch = "x86_64", target_arch = "aarch64",
# ))) {
#     example().unwrap()
# } else {
#     assert!(example().is_none());
# }
```

# Packed substring searching

Packed substring searching refers to the use of SIMD (Single Instruction,
Multiple Data) to accelerate the detection of matches in a haystack. Unlike
conventional algorithms, such as Aho-Corasick, SIMD algorithms for substring
search tend to do better with a small number of patterns, whereas Aho-Corasick
generally maintains reasonably consistent performance regardless of the number
of patterns you give it. Because of this, the vectorized searcher in this
sub-module cannot be used as a general purpose searcher, since building the
searcher may fail even when given a small number of patterns. However, in
exchange, when searching for a small number of patterns, searching can be quite
a bit faster than Aho-Corasick (sometimes by an order of magnitude).

The key take away here is that constructing a searcher from a list of patterns
is a fallible operation with no clear rules for when it will fail. While the
precise conditions under which building a searcher can fail are an
implementation detail, here are some common reasons:

* Too many patterns were given. Typically, the limit is on the order of 100 or
so, but this limit may fluctuate based on available CPU features.
* The available packed algorithms require CPU features that aren't available.
For example, currently, this crate only provides packed algorithms for
`x86_64` and `aarch64`. Therefore, constructing a packed searcher on any
other target will always fail.
* Zero patterns were given, or one of the patterns given was empty. Packed
searchers require at least one pattern and that all patterns are non-empty.
* Something else about the nature of the patterns (typically based on
heuristics) suggests that a packed searcher would perform very poorly, so
no searcher is built.
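
For instance, a minimal sketch of the zero-pattern and empty-pattern cases,
which fail on every platform (the pattern-count limit is an implementation
detail, so it isn't asserted here):

```
use aho_corasick::packed::Searcher;

// Packed searchers require at least one pattern...
assert!(Searcher::new(Vec::<&str>::new()).is_none());
// ...and all patterns must be non-empty.
assert!(Searcher::new(["foo", ""]).is_none());
```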
*/

pub use crate::packed::api::{Builder, Config, FindIter, MatchKind, Searcher};

mod api;
mod ext;
mod pattern;
mod rabinkarp;
mod teddy;
#[cfg(all(feature = "std", test))]
mod tests;
mod vector;
480
vendor/aho-corasick/src/packed/pattern.rs
vendored
Normal file
@ -0,0 +1,480 @@
|
||||
use core::{cmp, fmt, mem, u16, usize};
|
||||
|
||||
use alloc::{boxed::Box, string::String, vec, vec::Vec};
|
||||
|
||||
use crate::{
|
||||
packed::{api::MatchKind, ext::Pointer},
|
||||
PatternID,
|
||||
};
|
||||
|
||||
/// A non-empty collection of non-empty patterns to search for.
|
||||
///
|
||||
/// This collection of patterns is what is passed around to both execute
|
||||
/// searches and to construct the searchers themselves. Namely, this permits
|
||||
/// searches to avoid copying all of the patterns, and allows us to keep only
|
||||
/// one copy throughout all packed searchers.
|
||||
///
|
||||
/// Note that this collection is not a set. The same pattern can appear more
|
||||
/// than once.
|
||||
#[derive(Clone, Debug)]
|
||||
pub(crate) struct Patterns {
|
||||
/// The match semantics supported by this collection of patterns.
|
||||
///
|
||||
/// The match semantics determines the order of the iterator over patterns.
|
||||
/// For leftmost-first, patterns are provided in the same order as were
|
||||
/// provided by the caller. For leftmost-longest, patterns are provided in
|
||||
/// descending order of length, with ties broken by the order in which they
|
||||
/// were provided by the caller.
|
||||
kind: MatchKind,
|
||||
/// The collection of patterns, indexed by their identifier.
|
||||
by_id: Vec<Vec<u8>>,
|
||||
/// The order of patterns defined for iteration, given by pattern
|
||||
/// identifiers. The order of `by_id` and `order` is always the same for
|
||||
/// leftmost-first semantics, but may be different for leftmost-longest
|
||||
/// semantics.
|
||||
order: Vec<PatternID>,
|
||||
/// The length of the smallest pattern, in bytes.
|
||||
minimum_len: usize,
|
||||
/// The total number of pattern bytes across the entire collection. This
|
||||
/// is used for reporting total heap usage in constant time.
|
||||
total_pattern_bytes: usize,
|
||||
}
|
||||
|
||||
// BREADCRUMBS: I think we want to experiment with a different bucket
|
||||
// representation. Basically, each bucket is just a Range<usize> to a single
|
||||
// contiguous allocation? Maybe length-prefixed patterns or something? The
|
||||
// idea is to try to get rid of the pointer chasing in verification. I don't
|
||||
// know that that is the issue, but I suspect it is.
|
||||
|
||||
impl Patterns {
|
||||
/// Create a new collection of patterns for the given match semantics. The
|
||||
/// ID of each pattern is the index of the pattern at which it occurs in
|
||||
/// the `by_id` slice.
|
||||
///
|
||||
/// If any of the patterns in the slice given are empty, then this panics.
|
||||
/// Similarly, if the number of patterns given is zero, then this also
|
||||
/// panics.
|
||||
pub(crate) fn new() -> Patterns {
|
||||
Patterns {
|
||||
kind: MatchKind::default(),
|
||||
by_id: vec![],
|
||||
order: vec![],
|
||||
minimum_len: usize::MAX,
|
||||
total_pattern_bytes: 0,
|
||||
}
|
||||
}
|
||||
|
||||
/// Add a pattern to this collection.
|
||||
///
|
||||
/// This panics if the pattern given is empty.
|
||||
pub(crate) fn add(&mut self, bytes: &[u8]) {
|
||||
assert!(!bytes.is_empty());
|
||||
assert!(self.by_id.len() <= u16::MAX as usize);
|
||||
|
||||
let id = PatternID::new(self.by_id.len()).unwrap();
|
||||
self.order.push(id);
|
||||
self.by_id.push(bytes.to_vec());
|
||||
self.minimum_len = cmp::min(self.minimum_len, bytes.len());
|
||||
self.total_pattern_bytes += bytes.len();
|
||||
}
|
||||
|
||||
/// Set the match kind semantics for this collection of patterns.
|
||||
///
|
||||
/// If the kind is not set, then the default is leftmost-first.
|
||||
pub(crate) fn set_match_kind(&mut self, kind: MatchKind) {
|
||||
self.kind = kind;
|
||||
match self.kind {
|
||||
MatchKind::LeftmostFirst => {
|
||||
self.order.sort();
|
||||
}
|
||||
MatchKind::LeftmostLongest => {
|
||||
let (order, by_id) = (&mut self.order, &mut self.by_id);
|
||||
order.sort_by(|&id1, &id2| {
|
||||
by_id[id1].len().cmp(&by_id[id2].len()).reverse()
|
||||
});
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/// Return the number of patterns in this collection.
|
||||
///
|
||||
/// This is guaranteed to be greater than zero.
|
||||
pub(crate) fn len(&self) -> usize {
|
||||
self.by_id.len()
|
||||
}
|
||||
|
||||
/// Returns true if and only if this collection of patterns is empty.
|
||||
pub(crate) fn is_empty(&self) -> bool {
|
||||
self.len() == 0
|
||||
}
|
||||
|
||||
/// Returns the approximate total amount of heap used by these patterns, in
|
||||
/// units of bytes.
|
||||
pub(crate) fn memory_usage(&self) -> usize {
|
||||
self.order.len() * mem::size_of::<PatternID>()
|
||||
+ self.by_id.len() * mem::size_of::<Vec<u8>>()
|
||||
+ self.total_pattern_bytes
|
||||
}
|
||||
|
||||
/// Clears all heap memory associated with this collection of patterns and
|
||||
/// resets all state such that it is a valid empty collection.
|
||||
pub(crate) fn reset(&mut self) {
|
||||
self.kind = MatchKind::default();
|
||||
self.by_id.clear();
|
||||
self.order.clear();
|
||||
self.minimum_len = usize::MAX;
|
||||
}
|
||||
|
||||
/// Returns the length, in bytes, of the smallest pattern.
|
||||
///
|
||||
/// This is guaranteed to be at least one.
|
||||
pub(crate) fn minimum_len(&self) -> usize {
|
||||
self.minimum_len
|
||||
}
|
||||
|
||||
/// Returns the match semantics used by these patterns.
|
||||
pub(crate) fn match_kind(&self) -> &MatchKind {
|
||||
&self.kind
|
||||
}
|
||||
|
||||
/// Return the pattern with the given identifier. If such a pattern does
|
||||
/// not exist, then this panics.
|
||||
pub(crate) fn get(&self, id: PatternID) -> Pattern<'_> {
|
||||
Pattern(&self.by_id[id])
|
||||
}
|
||||
|
||||
/// Return the pattern with the given identifier without performing bounds
|
||||
/// checks.
|
||||
///
|
||||
/// # Safety
|
||||
///
|
||||
/// Callers must ensure that a pattern with the given identifier exists
|
||||
/// before using this method.
|
||||
pub(crate) unsafe fn get_unchecked(&self, id: PatternID) -> Pattern<'_> {
|
||||
Pattern(self.by_id.get_unchecked(id.as_usize()))
|
||||
}
|
||||
|
||||
/// Return an iterator over all the patterns in this collection, in the
|
||||
/// order in which they should be matched.
|
||||
///
|
||||
/// Specifically, in a naive multi-pattern matcher, the following is
|
||||
/// guaranteed to satisfy the match semantics of this collection of
|
||||
/// patterns:
|
||||
///
|
||||
/// ```ignore
|
||||
/// for i in 0..haystack.len():
|
||||
/// for p in patterns.iter():
|
||||
/// if haystack[i..].starts_with(p.bytes()):
|
||||
/// return Match(p.id(), i, i + p.bytes().len())
|
||||
/// ```
|
||||
///
|
||||
/// Namely, among the patterns in a collection, if they are matched in
|
||||
/// the order provided by this iterator, then the result is guaranteed
|
||||
/// to satisfy the correct match semantics. (Either leftmost-first or
|
||||
/// leftmost-longest.)
|
||||
pub(crate) fn iter(&self) -> PatternIter<'_> {
|
||||
PatternIter { patterns: self, i: 0 }
|
||||
}
|
||||
}
|
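// Illustrative sketch, not part of the vendored crate: it assumes the
// `Patterns::new()` constructor from upstream aho-corasick. With
// leftmost-longest semantics, `iter` yields longer patterns first, which
// is what makes the naive matcher shown in the `iter` docs correct.
#[cfg(test)]
mod order_sketch {
    use super::*;

    #[test]
    fn leftmost_longest_yields_longest_first() {
        let mut pats = Patterns::new();
        pats.add(b"foo");
        pats.add(b"foobar");
        pats.set_match_kind(MatchKind::LeftmostLongest);
        let (_, first) = pats.iter().next().unwrap();
        assert_eq!(&b"foobar"[..], first.bytes());
    }
}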
||||
|
||||
/// An iterator over the patterns in the `Patterns` collection.
|
||||
///
|
||||
/// The order of the patterns provided by this iterator is consistent with the
|
||||
/// match semantics of the originating collection of patterns.
|
||||
///
|
||||
/// The lifetime `'p` corresponds to the lifetime of the collection of patterns
|
||||
/// this is iterating over.
|
||||
#[derive(Debug)]
|
||||
pub(crate) struct PatternIter<'p> {
|
||||
patterns: &'p Patterns,
|
||||
i: usize,
|
||||
}
|
||||
|
||||
impl<'p> Iterator for PatternIter<'p> {
|
||||
type Item = (PatternID, Pattern<'p>);
|
||||
|
||||
fn next(&mut self) -> Option<(PatternID, Pattern<'p>)> {
|
||||
if self.i >= self.patterns.len() {
|
||||
return None;
|
||||
}
|
||||
let id = self.patterns.order[self.i];
|
||||
let p = self.patterns.get(id);
|
||||
self.i += 1;
|
||||
Some((id, p))
|
||||
}
|
||||
}
|
||||
|
||||
/// A pattern that is used in packed searching.
|
||||
#[derive(Clone)]
|
||||
pub(crate) struct Pattern<'a>(&'a [u8]);
|
||||
|
||||
impl<'a> fmt::Debug for Pattern<'a> {
|
||||
fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
|
||||
f.debug_struct("Pattern")
|
||||
.field("lit", &String::from_utf8_lossy(&self.0))
|
||||
.finish()
|
||||
}
|
||||
}
|
||||
|
||||
impl<'p> Pattern<'p> {
|
||||
/// Returns the length of this pattern, in bytes.
|
||||
pub(crate) fn len(&self) -> usize {
|
||||
self.0.len()
|
||||
}
|
||||
|
||||
/// Returns the bytes of this pattern.
|
||||
pub(crate) fn bytes(&self) -> &[u8] {
|
||||
&self.0
|
||||
}
|
||||
|
||||
/// Returns the first `len` low nybbles from this pattern. If this pattern
|
||||
/// is shorter than `len`, then this panics.
|
||||
pub(crate) fn low_nybbles(&self, len: usize) -> Box<[u8]> {
|
||||
let mut nybs = vec![0; len].into_boxed_slice();
|
||||
for (i, byte) in self.bytes().iter().take(len).enumerate() {
|
||||
nybs[i] = byte & 0xF;
|
||||
}
|
||||
nybs
|
||||
}
|
||||
|
||||
/// Returns true if this pattern is a prefix of the given bytes.
|
||||
#[inline(always)]
|
||||
pub(crate) fn is_prefix(&self, bytes: &[u8]) -> bool {
|
||||
is_prefix(bytes, self.bytes())
|
||||
}
|
||||
|
||||
/// Returns true if this pattern is a prefix of the haystack given by the
|
||||
/// raw `start` and `end` pointers.
|
||||
///
|
||||
/// # Safety
|
||||
///
|
||||
/// * It must be the case that `start < end` and that the distance between
|
||||
/// them is at least equal to `V::BYTES`. That is, it must always be valid
|
||||
/// to do at least an unaligned load of `V` at `start`.
|
||||
/// * Both `start` and `end` must be valid for reads.
|
||||
/// * Both `start` and `end` must point to an initialized value.
|
||||
/// * Both `start` and `end` must point to the same allocated object and
|
||||
/// must either be in bounds or at most one byte past the end of the
|
||||
/// allocated object.
|
||||
/// * Both `start` and `end` must be _derived from_ a pointer to the same
|
||||
/// object.
|
||||
/// * The distance between `start` and `end` must not overflow `isize`.
|
||||
/// * The distance being in bounds must not rely on "wrapping around" the
|
||||
/// address space.
|
||||
#[inline(always)]
|
||||
pub(crate) unsafe fn is_prefix_raw(
|
||||
&self,
|
||||
start: *const u8,
|
||||
end: *const u8,
|
||||
) -> bool {
|
||||
let patlen = self.bytes().len();
|
||||
let haylen = end.distance(start);
|
||||
if patlen > haylen {
|
||||
return false;
|
||||
}
|
||||
// SAFETY: We've checked that the haystack has length at least equal
|
||||
// to this pattern. All other safety concerns are the responsibility
|
||||
// of the caller.
|
||||
is_equal_raw(start, self.bytes().as_ptr(), patlen)
|
||||
}
|
||||
}
|
||||
|
||||
/// Returns true if and only if `needle` is a prefix of `haystack`.
|
||||
///
|
||||
/// This uses a latency optimized variant of `memcmp` internally which *might*
|
||||
/// make this faster for very short strings.
|
||||
///
|
||||
/// # Inlining
|
||||
///
|
||||
/// This routine is marked `inline(always)`. If you want to call this function
|
||||
/// in a way that is not always inlined, you'll need to wrap a call to it in
|
||||
/// another function that is marked as `inline(never)` or just `inline`.
|
||||
#[inline(always)]
|
||||
fn is_prefix(haystack: &[u8], needle: &[u8]) -> bool {
|
||||
if needle.len() > haystack.len() {
|
||||
return false;
|
||||
}
|
||||
// SAFETY: Our pointers are derived directly from borrowed slices which
|
||||
// uphold all of our safety guarantees except for length. We account for
|
||||
// length with the check above.
|
||||
unsafe { is_equal_raw(haystack.as_ptr(), needle.as_ptr(), needle.len()) }
|
||||
}
|
||||
|
||||
/// Compare corresponding bytes in `x` and `y` for equality.
|
||||
///
|
||||
/// That is, this returns true if and only if `x.len() == y.len()` and
|
||||
/// `x[i] == y[i]` for all `0 <= i < x.len()`.
|
||||
///
|
||||
/// Note that this isn't used outside of tests, where it's a convenient way
|
||||
/// of testing `is_equal_raw`.
|
||||
///
|
||||
/// # Inlining
|
||||
///
|
||||
/// This routine is marked `inline(always)`. If you want to call this function
|
||||
/// in a way that is not always inlined, you'll need to wrap a call to it in
|
||||
/// another function that is marked as `inline(never)` or just `inline`.
|
||||
///
|
||||
/// # Motivation
|
||||
///
|
||||
/// Why not use slice equality instead? Well, slice equality usually results in
|
||||
/// a call out to the current platform's `libc` which might not be inlineable
|
||||
/// or have other overhead. This routine isn't guaranteed to be a win, but it
|
||||
/// might be in some cases.
|
||||
#[cfg(test)]
|
||||
#[inline(always)]
|
||||
fn is_equal(x: &[u8], y: &[u8]) -> bool {
|
||||
if x.len() != y.len() {
|
||||
return false;
|
||||
}
|
||||
// SAFETY: Our pointers are derived directly from borrowed slices which
|
||||
// uphold all of our safety guarantees except for length. We account for
|
||||
// length with the check above.
|
||||
unsafe { is_equal_raw(x.as_ptr(), y.as_ptr(), x.len()) }
|
||||
}
|
||||
|
||||
/// Compare `n` bytes at the given pointers for equality.
|
||||
///
|
||||
/// This returns true if and only if `*x.add(i) == *y.add(i)` for all
|
||||
/// `0 <= i < n`.
|
||||
///
|
||||
/// # Inlining
|
||||
///
|
||||
/// This routine is marked `inline(always)`. If you want to call this function
|
||||
/// in a way that is not always inlined, you'll need to wrap a call to it in
|
||||
/// another function that is marked as `inline(never)` or just `inline`.
|
||||
///
|
||||
/// # Motivation
|
||||
///
|
||||
/// Why not use slice equality instead? Well, slice equality usually results in
|
||||
/// a call out to the current platform's `libc` which might not be inlineable
|
||||
/// or have other overhead. This routine isn't guaranteed to be a win, but it
|
||||
/// might be in some cases.
|
||||
///
|
||||
/// # Safety
|
||||
///
|
||||
/// * Both `x` and `y` must be valid for reads of up to `n` bytes.
|
||||
/// * Both `x` and `y` must point to an initialized value.
|
||||
/// * Both `x` and `y` must each point to an allocated object and
|
||||
/// must either be in bounds or at most one byte past the end of the
|
||||
/// allocated object. `x` and `y` do not need to point to the same allocated
|
||||
/// object, but they may.
|
||||
/// * Both `x` and `y` must be _derived from_ a pointer to their respective
|
||||
/// allocated objects.
|
||||
/// * The distance between `x` and `x+n` must not overflow `isize`. Similarly
|
||||
/// for `y` and `y+n`.
|
||||
/// * The distance being in bounds must not rely on "wrapping around" the
|
||||
/// address space.
|
||||
#[inline(always)]
|
||||
unsafe fn is_equal_raw(mut x: *const u8, mut y: *const u8, n: usize) -> bool {
|
||||
// If we don't have enough bytes to do 4-byte at a time loads, then
|
||||
// handle each possible length specially. Note that I used to have a
|
||||
// byte-at-a-time loop here and that turned out to be quite a bit slower
|
||||
// for the memmem/pathological/defeat-simple-vector-alphabet benchmark.
|
||||
if n < 4 {
|
||||
return match n {
|
||||
0 => true,
|
||||
1 => x.read() == y.read(),
|
||||
2 => {
|
||||
x.cast::<u16>().read_unaligned()
|
||||
== y.cast::<u16>().read_unaligned()
|
||||
}
|
||||
// I also tried copy_nonoverlapping here and it looks like the
|
||||
// codegen is the same.
|
||||
3 => x.cast::<[u8; 3]>().read() == y.cast::<[u8; 3]>().read(),
|
||||
_ => unreachable!(),
|
||||
};
|
||||
}
|
||||
// When we have 4 or more bytes to compare, then proceed in chunks of 4 at
|
||||
// a time using unaligned loads.
|
||||
//
|
||||
// Also, why do 4 byte loads instead of, say, 8 byte loads? The reason is
|
||||
// that this particular version of memcmp is likely to be called with tiny
|
||||
// needles. That means that if we do 8 byte loads, then a higher proportion
|
||||
// of memcmp calls will use the slower variant above. With that said, this
|
||||
// is a hypothesis and is only loosely supported by benchmarks. There's
|
||||
// likely some improvement that could be made here. The main thing here
|
||||
// though is to optimize for latency, not throughput.
|
||||
|
||||
// SAFETY: The caller is responsible for ensuring the pointers we get are
|
||||
// valid and readable for at least `n` bytes. We also do unaligned loads,
|
||||
// so there's no need to ensure we're aligned. (This is justified by this
|
||||
// routine being specifically for short strings.)
|
||||
let xend = x.add(n.wrapping_sub(4));
|
||||
let yend = y.add(n.wrapping_sub(4));
|
||||
while x < xend {
|
||||
let vx = x.cast::<u32>().read_unaligned();
|
||||
let vy = y.cast::<u32>().read_unaligned();
|
||||
if vx != vy {
|
||||
return false;
|
||||
}
|
||||
x = x.add(4);
|
||||
y = y.add(4);
|
||||
}
|
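// Handle the final (possibly overlapping) chunk: loading 4 bytes at
// exactly `n - 4` may re-compare bytes already checked by the loop
// above, but it avoids a slower byte-at-a-time tail.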
||||
let vx = xend.cast::<u32>().read_unaligned();
|
||||
let vy = yend.cast::<u32>().read_unaligned();
|
||||
vx == vy
|
||||
}
|
||||
|
||||
#[cfg(test)]
|
||||
mod tests {
|
||||
use super::*;
|
||||
|
||||
#[test]
|
||||
fn equals_different_lengths() {
|
||||
assert!(!is_equal(b"", b"a"));
|
||||
assert!(!is_equal(b"a", b""));
|
||||
assert!(!is_equal(b"ab", b"a"));
|
||||
assert!(!is_equal(b"a", b"ab"));
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn equals_mismatch() {
|
||||
let one_mismatch = [
|
||||
(&b"a"[..], &b"x"[..]),
|
||||
(&b"ab"[..], &b"ax"[..]),
|
||||
(&b"abc"[..], &b"abx"[..]),
|
||||
(&b"abcd"[..], &b"abcx"[..]),
|
||||
(&b"abcde"[..], &b"abcdx"[..]),
|
||||
(&b"abcdef"[..], &b"abcdex"[..]),
|
||||
(&b"abcdefg"[..], &b"abcdefx"[..]),
|
||||
(&b"abcdefgh"[..], &b"abcdefgx"[..]),
|
||||
(&b"abcdefghi"[..], &b"abcdefghx"[..]),
|
||||
(&b"abcdefghij"[..], &b"abcdefghix"[..]),
|
||||
(&b"abcdefghijk"[..], &b"abcdefghijx"[..]),
|
||||
(&b"abcdefghijkl"[..], &b"abcdefghijkx"[..]),
|
||||
(&b"abcdefghijklm"[..], &b"abcdefghijklx"[..]),
|
||||
(&b"abcdefghijklmn"[..], &b"abcdefghijklmx"[..]),
|
||||
];
|
||||
for (x, y) in one_mismatch {
|
||||
assert_eq!(x.len(), y.len(), "lengths should match");
|
||||
assert!(!is_equal(x, y));
|
||||
assert!(!is_equal(y, x));
|
||||
}
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn equals_yes() {
|
||||
assert!(is_equal(b"", b""));
|
||||
assert!(is_equal(b"a", b"a"));
|
||||
assert!(is_equal(b"ab", b"ab"));
|
||||
assert!(is_equal(b"abc", b"abc"));
|
||||
assert!(is_equal(b"abcd", b"abcd"));
|
||||
assert!(is_equal(b"abcde", b"abcde"));
|
||||
assert!(is_equal(b"abcdef", b"abcdef"));
|
||||
assert!(is_equal(b"abcdefg", b"abcdefg"));
|
||||
assert!(is_equal(b"abcdefgh", b"abcdefgh"));
|
||||
assert!(is_equal(b"abcdefghi", b"abcdefghi"));
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn prefix() {
|
||||
assert!(is_prefix(b"", b""));
|
||||
assert!(is_prefix(b"a", b""));
|
||||
assert!(is_prefix(b"ab", b""));
|
||||
assert!(is_prefix(b"foo", b"foo"));
|
||||
assert!(is_prefix(b"foobar", b"foo"));
|
||||
|
||||
assert!(!is_prefix(b"foo", b"fob"));
|
||||
assert!(!is_prefix(b"foobar", b"fob"));
|
||||
}
|
||||
}
|
168
vendor/aho-corasick/src/packed/rabinkarp.rs
vendored
Normal file
@ -0,0 +1,168 @@
|
||||
use alloc::{sync::Arc, vec, vec::Vec};
|
||||
|
||||
use crate::{packed::pattern::Patterns, util::search::Match, PatternID};
|
||||
|
||||
/// The type of the rolling hash used in the Rabin-Karp algorithm.
|
||||
type Hash = usize;
|
||||
|
||||
/// The number of buckets to store our patterns in. We don't want this to be
|
||||
/// too big in order to avoid wasting memory, but we don't want it to be too
|
||||
/// small either to avoid spending too much time confirming literals.
|
||||
///
|
||||
/// The number of buckets MUST be a power of two. Otherwise, determining the
|
||||
/// bucket from a hash will slow down the code considerably. Using a power
|
||||
/// of two means `hash % NUM_BUCKETS` can compile down to a simple `and`
|
||||
/// instruction.
|
||||
const NUM_BUCKETS: usize = 64;
|
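// (For example, with NUM_BUCKETS = 64, `hash % NUM_BUCKETS` is just `hash & 63`.)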
||||
|
||||
/// An implementation of the Rabin-Karp algorithm. The main idea of this
|
||||
/// algorithm is to maintain a rolling hash as it moves through the input, and
|
||||
/// then check whether that hash corresponds to the same hash for any of the
|
||||
/// patterns we're looking for.
|
||||
///
|
||||
/// A drawback of naively scaling Rabin-Karp to multiple patterns is that
|
||||
/// it requires all of the patterns to be the same length, which in turn
|
||||
/// corresponds to the number of bytes to hash. We adapt this to work for
|
||||
/// multiple patterns of varying size by fixing the number of bytes to hash
|
||||
/// to be the length of the smallest pattern. We also split the patterns into
|
||||
/// several buckets to hopefully make the confirmation step faster.
|
||||
///
|
||||
/// Wikipedia has a decent explanation, if a bit heavy on the theory:
|
||||
/// https://en.wikipedia.org/wiki/Rabin%E2%80%93Karp_algorithm
|
||||
///
|
||||
/// But ESMAJ provides something a bit more concrete:
|
||||
/// https://www-igm.univ-mlv.fr/~lecroq/string/node5.html
|
||||
#[derive(Clone, Debug)]
|
||||
pub(crate) struct RabinKarp {
|
||||
/// The patterns we're searching for.
|
||||
patterns: Arc<Patterns>,
|
||||
/// The order of patterns in each bucket is significant. Namely, they are
|
||||
/// arranged such that the first one to match is the correct match. This
|
||||
/// may not necessarily correspond to the order provided by the caller.
|
||||
/// For example, if leftmost-longest semantics are used, then the patterns
|
||||
/// are sorted by their length in descending order. If leftmost-first
|
||||
/// semantics are used, then the patterns are sorted by their pattern ID
|
||||
/// in ascending order (which corresponds to the caller's order).
|
||||
buckets: Vec<Vec<(Hash, PatternID)>>,
|
||||
/// The length of the hashing window. Generally, this corresponds to the
|
||||
/// length of the smallest pattern.
|
||||
hash_len: usize,
|
||||
/// The factor to subtract out of a hash before updating it with a new
|
||||
/// byte.
|
||||
hash_2pow: usize,
|
||||
}
|
||||
|
||||
impl RabinKarp {
|
||||
/// Compile a new Rabin-Karp matcher from the patterns given.
|
||||
///
|
||||
/// This panics if any of the patterns in the collection are empty, or if
|
||||
/// the collection is itself empty.
|
||||
pub(crate) fn new(patterns: &Arc<Patterns>) -> RabinKarp {
|
||||
assert!(patterns.len() >= 1);
|
||||
let hash_len = patterns.minimum_len();
|
||||
assert!(hash_len >= 1);
|
||||
|
||||
let mut hash_2pow = 1usize;
|
||||
for _ in 1..hash_len {
|
||||
hash_2pow = hash_2pow.wrapping_shl(1);
|
||||
}
|
||||
|
||||
let mut rk = RabinKarp {
|
||||
patterns: Arc::clone(patterns),
|
||||
buckets: vec![vec![]; NUM_BUCKETS],
|
||||
hash_len,
|
||||
hash_2pow,
|
||||
};
|
||||
for (id, pat) in patterns.iter() {
|
||||
let hash = rk.hash(&pat.bytes()[..rk.hash_len]);
|
||||
let bucket = hash % NUM_BUCKETS;
|
||||
rk.buckets[bucket].push((hash, id));
|
||||
}
|
||||
rk
|
||||
}
|
||||
|
||||
/// Return the first matching pattern in the given haystack, beginning the
|
||||
/// search at `at`.
|
||||
pub(crate) fn find_at(
|
||||
&self,
|
||||
haystack: &[u8],
|
||||
mut at: usize,
|
||||
) -> Option<Match> {
|
||||
assert_eq!(NUM_BUCKETS, self.buckets.len());
|
||||
|
||||
if at + self.hash_len > haystack.len() {
|
||||
return None;
|
||||
}
|
||||
let mut hash = self.hash(&haystack[at..at + self.hash_len]);
|
||||
loop {
|
||||
let bucket = &self.buckets[hash % NUM_BUCKETS];
|
||||
for &(phash, pid) in bucket {
|
||||
if phash == hash {
|
||||
if let Some(c) = self.verify(pid, haystack, at) {
|
||||
return Some(c);
|
||||
}
|
||||
}
|
||||
}
|
||||
if at + self.hash_len >= haystack.len() {
|
||||
return None;
|
||||
}
|
||||
hash = self.update_hash(
|
||||
hash,
|
||||
haystack[at],
|
||||
haystack[at + self.hash_len],
|
||||
);
|
||||
at += 1;
|
||||
}
|
||||
}
|
||||
|
||||
/// Returns the approximate total amount of heap used by this searcher, in
|
||||
/// units of bytes.
|
||||
pub(crate) fn memory_usage(&self) -> usize {
|
||||
self.buckets.len() * core::mem::size_of::<Vec<(Hash, PatternID)>>()
|
||||
+ self.patterns.len() * core::mem::size_of::<(Hash, PatternID)>()
|
||||
}
|
||||
|
||||
/// Verify whether the pattern with the given id matches at
|
||||
/// `haystack[at..]`.
|
||||
///
|
||||
/// We tag this function as `cold` because it helps improve codegen.
|
||||
/// Intuitively, it would seem like inlining it would be better. However,
|
||||
/// the only time this is called and a match is not found is when
|
||||
/// there is a hash collision, or when a prefix of a pattern matches but
|
||||
/// the entire pattern doesn't match. This is hopefully fairly rare, and
|
||||
/// if it does occur a lot, it's going to be slow no matter what we do.
|
||||
#[cold]
|
||||
fn verify(
|
||||
&self,
|
||||
id: PatternID,
|
||||
haystack: &[u8],
|
||||
at: usize,
|
||||
) -> Option<Match> {
|
||||
let pat = self.patterns.get(id);
|
||||
if pat.is_prefix(&haystack[at..]) {
|
||||
Some(Match::new(id, at..at + pat.len()))
|
||||
} else {
|
||||
None
|
||||
}
|
||||
}
|
||||
|
||||
/// Hash the given bytes.
|
||||
fn hash(&self, bytes: &[u8]) -> Hash {
|
||||
assert_eq!(self.hash_len, bytes.len());
|
||||
|
||||
let mut hash = 0usize;
|
||||
for &b in bytes {
|
||||
hash = hash.wrapping_shl(1).wrapping_add(b as usize);
|
||||
}
|
||||
hash
|
||||
}
|
||||
|
||||
/// Update the hash given based on removing `old_byte` at the beginning
|
||||
/// of some byte string, and appending `new_byte` to the end of that same
|
||||
/// byte string.
|
||||
fn update_hash(&self, prev: Hash, old_byte: u8, new_byte: u8) -> Hash {
|
||||
prev.wrapping_sub((old_byte as usize).wrapping_mul(self.hash_2pow))
|
||||
.wrapping_shl(1)
|
||||
.wrapping_add(new_byte as usize)
|
||||
}
|
||||
}
|
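// Illustrative sketch, not part of the vendored crate: the rolling hash
// satisfies `update_hash(hash(&s[i..i + len]), s[i], s[i + len]) ==
// hash(&s[i + 1..i + 1 + len])`. A standalone check of that invariant
// for a window of 3 bytes (so the factor `hash_2pow` is `1 << 2 = 4`):
#[cfg(test)]
mod rolling_hash_sketch {
    fn hash3(bytes: &[u8]) -> usize {
        let mut h = 0usize;
        for &b in bytes {
            h = h.wrapping_shl(1).wrapping_add(b as usize);
        }
        h
    }

    #[test]
    fn rolling_matches_rehash() {
        let hay = b"abcdef";
        let mut h = hash3(&hay[0..3]);
        for i in 1..=hay.len() - 3 {
            // Remove hay[i - 1] (weighted by hash_2pow = 4), shift, and
            // add the new byte, mirroring `update_hash` above.
            h = h
                .wrapping_sub((hay[i - 1] as usize).wrapping_mul(4))
                .wrapping_shl(1)
                .wrapping_add(hay[i + 2] as usize);
            assert_eq!(h, hash3(&hay[i..i + 3]));
        }
    }
}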
386
vendor/aho-corasick/src/packed/teddy/README.md
vendored
Normal file
@ -0,0 +1,386 @@
|
||||
Teddy is a SIMD accelerated multiple substring matching algorithm. The name
|
||||
and the core ideas in the algorithm were learned from the [Hyperscan][1_u]
|
||||
project. The implementation in this repository was mostly motivated for use in
|
||||
accelerating regex searches by searching for small sets of required literals
|
||||
extracted from the regex.
|
||||
|
||||
|
||||
# Background
|
||||
|
||||
The key idea of Teddy is to do *packed* substring matching. In the literature,
|
||||
packed substring matching is the idea of examining multiple bytes in a haystack
|
||||
at a time to detect matches. Implementations of, for example, memchr (which
|
||||
detects matches of a single byte) have been doing this for years. Only
|
||||
recently, with the introduction of various SIMD instructions, has this been
|
||||
extended to substring matching. The PCMPESTRI instruction (and its relatives),
|
||||
for example, implements substring matching in hardware. It is, however, limited
|
||||
to substrings of length 16 bytes or fewer, but this restriction is fine in a
|
||||
regex engine, since we rarely care about the performance difference between
|
||||
searching for a 16 byte literal and a 16 + N byte literal; 16 is already long
|
||||
enough. The key downside of the PCMPESTRI instruction, on current (2016) CPUs
|
||||
at least, is its latency and throughput. As a result, it is often faster to
|
||||
do substring search with a Boyer-Moore (or Two-Way) variant and a well placed
|
||||
memchr to quickly skip through the haystack.
|
||||
|
||||
There are fewer results from the literature on packed substring matching,
|
||||
and even fewer for packed multiple substring matching. Ben-Kiki et al. [2]
|
||||
describes use of PCMPESTRI for substring matching, but is mostly theoretical
|
||||
and hand-waves performance. There is other theoretical work done by Bille [3]
|
||||
as well.
|
||||
|
||||
The rest of the work in the field, as far as I'm aware, is by Faro and Kulekci
|
||||
and is generally focused on multiple pattern search. Their first paper [4a]
|
||||
introduces the concept of a fingerprint, which is computed for every block of
|
||||
N bytes in every pattern. The haystack is then scanned N bytes at a time and
|
||||
a fingerprint is computed in the same way it was computed for blocks in the
|
||||
patterns. If the fingerprint corresponds to one that was found in a pattern,
|
||||
then a verification step follows to confirm that one of the substrings with the
|
||||
corresponding fingerprint actually matches at the current location. Various
|
||||
implementation tricks are employed to make sure the fingerprint lookup is fast;
|
||||
typically by truncating the fingerprint. (This may, of course, provoke more
|
||||
steps in the verification process, so a balance must be struck.)
|
||||
|
||||
The main downside of [4a] is that the minimum substring length is 32 bytes,
|
||||
presumably because of how the algorithm uses certain SIMD instructions. This
|
||||
essentially makes it useless for general purpose regex matching, where a small
|
||||
number of short patterns is far more likely.
|
||||
|
||||
Faro and Kulekci published another paper [4b] that is conceptually very similar
|
||||
to [4a]. The key difference is that it uses the CRC32 instruction (introduced
|
||||
as part of SSE 4.2) to compute fingerprint values. This also enables the
|
||||
algorithm to work effectively on substrings as short as 7 bytes with 4 byte
|
||||
windows. 7 bytes is unfortunately still too long. The window could be
|
||||
technically shrunk to 2 bytes, thereby reducing minimum length to 3, but the
|
||||
small window size ends up negating most performance benefits, and short patterns are likely
|
||||
the common case in a general purpose regex engine.
|
||||
|
||||
Faro and Kulekci also published [4c] that appears to be intended as a
|
||||
replacement to using PCMPESTRI. In particular, it is specifically motivated by
|
||||
the high throughput/latency time of PCMPESTRI and therefore chooses other SIMD
|
||||
instructions that are faster. While this approach works for short substrings,
|
||||
I personally couldn't see a way to generalize it to multiple substring search.
|
||||
|
||||
Faro and Kulekci have another paper [4d] that I haven't been able to read
|
||||
because it is behind a paywall.
|
||||
|
||||
|
||||
# Teddy
|
||||
|
||||
Finally, we get to Teddy. If the above literature review is complete, then it
|
||||
appears that Teddy is a novel algorithm. More than that, in my experience, it
|
||||
completely blows away the competition for short substrings, which is exactly
|
||||
what we want in a general purpose regex engine. Again, the algorithm appears
|
||||
to be developed by the authors of [Hyperscan][1_u]. Hyperscan was open sourced
|
||||
late 2015, and no earlier history could be found. Therefore, tracking the exact
|
||||
provenance of the algorithm with respect to the published literature seems
|
||||
difficult.
|
||||
|
||||
At a high level, Teddy works somewhat similarly to the fingerprint algorithms
|
||||
published by Faro and Kulekci, but Teddy does it in a way that scales a bit
|
||||
better. Namely:
|
||||
|
||||
1. Teddy's core algorithm scans the haystack in 16 (for SSE, or 32 for AVX)
|
||||
byte chunks. 16 (or 32) is significant because it corresponds to the number
|
||||
of bytes in a SIMD vector.
|
||||
2. Bitwise operations are performed on each chunk to discover if any region of
|
||||
it matches a set of precomputed fingerprints from the patterns. If there are
|
||||
matches, then a verification step is performed. In this implementation, our
|
||||
verification step is naive. This can be improved upon.
|
||||
|
||||
The details to make this work are quite clever. First, we must choose how to
|
||||
pick our fingerprints. In Hyperscan's implementation, I *believe* they use the
|
||||
last N bytes of each substring, where N must be at most the minimum length of
|
||||
any substring in the set being searched. In this implementation, we use the
|
||||
first N bytes of each substring. (The tradeoffs between these choices aren't
|
||||
yet clear to me.) We then must figure out how to quickly test whether an
|
||||
occurrence of any fingerprint from the set of patterns appears in a 16 byte
|
||||
block from the haystack. To keep things simple, let's assume N = 1 and examine
|
||||
some examples to motivate the approach. Here are our patterns:
|
||||
|
||||
```ignore
|
||||
foo
|
||||
bar
|
||||
baz
|
||||
```
|
||||
|
||||
The corresponding fingerprints, for N = 1, are `f`, `b` and `b`. Now let's set
|
||||
our 16 byte block to:
|
||||
|
||||
```ignore
|
||||
bat cat foo bump
|
||||
xxxxxxxxxxxxxxxx
|
||||
```
|
||||
|
||||
To cut to the chase, Teddy works by using bitsets. In particular, Teddy creates
|
||||
a mask that allows us to quickly compute membership of a fingerprint in a 16
|
||||
byte block that also tells which pattern the fingerprint corresponds to. In
|
||||
this case, our fingerprint is a single byte, so an appropriate abstraction is
|
||||
a map from a single byte to a list of patterns that contain that fingerprint:
|
||||
|
||||
```ignore
|
||||
f |--> foo
|
||||
b |--> bar, baz
|
||||
```
|
||||
|
||||
Now, all we need to do is figure out how to represent this map in vector space
|
||||
and use normal SIMD operations to perform a lookup. The first simplification
|
||||
we can make is to represent our patterns as bit fields occupying a single
|
||||
byte. This is important, because a single SIMD vector can store 16 bytes.
|
||||
|
||||
```ignore
|
||||
f |--> 00000001
|
||||
b |--> 00000010, 00000100
|
||||
```
|
||||
|
||||
How do we perform lookup though? It turns out that SSSE3 introduced a very cool
|
||||
instruction called PSHUFB. The instruction takes two SIMD vectors, `A` and `B`,
|
||||
and returns a third vector `C`. All vectors are treated as 16 8-bit integers.
|
||||
`C` is formed by `C[i] = A[B[i]]`. (This is a bit of a simplification, but true
|
||||
for the purposes of this algorithm. For full details, see [Intel's Intrinsics
|
||||
Guide][5_u].) This essentially lets us use the values in `B` to lookup values
|
||||
in `A`.
|
||||
|
||||
If we could somehow cause `B` to contain our 16 byte block from the haystack,
|
||||
and if `A` could contain our bitmasks, then we'd end up with something like
|
||||
this for `A`:
|
||||
|
||||
```ignore
|
||||
0x00 0x01 ... 0x62 ... 0x66 ... 0xFF
|
||||
A = 0 0 00000110 00000001 0
|
||||
```
|
||||
|
||||
And if `B` contains our window from our haystack, we could use shuffle to take
|
||||
the values from `B` and use them to look up our bitsets in `A`. But of course,
|
||||
we can't do this because `A` in the above example contains 256 bytes, which
|
||||
is much larger than the size of a SIMD vector.
|
||||
|
||||
Nybbles to the rescue! A nybble is 4 bits. Instead of one mask to hold all of
|
||||
our bitsets, we can use two masks, where one mask corresponds to the lower four
|
||||
bits of our fingerprint and the other mask corresponds to the upper four bits.
|
||||
So our map now looks like:
|
||||
|
||||
```ignore
|
||||
'f' & 0xF = 0x6 |--> 00000001
|
||||
'f' >> 4 = 0x6 |--> 00000111
|
||||
'b' & 0xF = 0x2 |--> 00000110
|
||||
'b' >> 4 = 0x6 |--> 00000111
|
||||
```
|
||||
|
||||
Notice that the bitsets for each nybble correspond to the union of all
|
||||
fingerprints that contain that nybble. For example, both `f` and `b` have the
|
||||
same upper 4 bits but differ on the lower 4 bits. Putting this together, we
|
||||
have `A0`, `A1` and `B`, where `A0` is our mask for the lower nybble, `A1` is
|
||||
our mask for the upper nybble and `B` is our 16 byte block from the haystack:
|
||||
|
||||
```ignore
|
||||
0x00 0x01 0x02 0x03 ... 0x06 ... 0xF
|
||||
A0 = 0 0 00000110 0 00000001 0
|
||||
A1 = 0 0 0 0 00000111 0
|
||||
B = b a t _ t p
|
||||
B = 0x62 0x61 0x74 0x20 0x74 0x70
|
||||
```
|
||||
|
||||
But of course, we can't use `B` with `PSHUFB` yet, since its values are 8 bits,
|
||||
and we need indexes that are at most 4 bits (corresponding to one of 16
|
||||
values). We can apply the same transformation to split `B` into lower and upper
|
||||
nybbles as we did `A`. As before, `B0` corresponds to the lower nybbles and
|
||||
`B1` corresponds to the upper nybbles:
|
||||
|
||||
```ignore
|
||||
b a t _ c a t _ f o o _ b u m p
|
||||
B0 = 0x2 0x1 0x4 0x0 0x3 0x1 0x4 0x0 0x6 0xF 0xF 0x0 0x2 0x5 0xD 0x0
|
||||
B1 = 0x6 0x6 0x7 0x2 0x6 0x6 0x7 0x2 0x6 0x6 0x6 0x2 0x6 0x7 0x6 0x7
|
||||
```
|
||||
|
||||
And now we have a nice correspondence. `B0` can index `A0` and `B1` can index
|
||||
`A1`. Here's what we get when we apply `C0 = PSHUFB(A0, B0)`:
|
||||
|
||||
```ignore
|
||||
b a ... f o ... p
|
||||
A0[0x2] A0[0x1] A0[0x6] A0[0xF] A0[0x0]
|
||||
C0 = 00000110 0 00000001 0 0
|
||||
```
|
||||
|
||||
And `C1 = PSHUFB(A1, B1)`:
|
||||
|
||||
```ignore
|
||||
b a ... f o ... p
|
||||
A1[0x6] A1[0x6] A1[0x6] A1[0x6] A1[0x7]
|
||||
C1 = 00000111 00000111 00000111 00000111 0
|
||||
```
|
||||
|
||||
Notice how neither one of `C0` or `C1` is guaranteed to report fully correct
|
||||
results all on its own. For example, `C1` claims that `b` is a fingerprint for
|
||||
the pattern `foo` (since `A1[0x6] = 00000111`), and that `o` is a fingerprint
|
||||
for all of our patterns. But if we combined `C0` and `C1` with an `AND`
|
||||
operation:
|
||||
|
||||
```ignore
|
||||
b a ... f o ... p
|
||||
C = 00000110 0 00000001 0 0
|
||||
```
|
||||
|
||||
Then we now have that `C[i]` contains a bitset corresponding to the matching
|
||||
fingerprints in a haystack's 16 byte block, where `i` is the `ith` byte in that
|
||||
block.
|
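To make the lookup concrete, here is a scalar sketch of the N = 1 case for the
example patterns, with `PSHUFB` emulated by ordinary table indexing. (This is
an illustration only, not the vendored implementation.)

```ignore
fn main() {
    // a0[n]: union of the pattern bitsets whose fingerprint has low
    // nybble n; a1[n] is the same, keyed by the high nybble.
    let (mut a0, mut a1) = ([0u8; 16], [0u8; 16]);
    for (bit, &fp) in [b'f', b'b', b'b'].iter().enumerate() {
        a0[usize::from(fp & 0xF)] |= 1u8 << bit;
        a1[usize::from(fp >> 4)] |= 1u8 << bit;
    }
    // C[i] = A0[B0[i]] & A1[B1[i]] for each byte in the block.
    for (i, &byte) in b"bat cat foo bump".iter().enumerate() {
        let c = a0[usize::from(byte & 0xF)] & a1[usize::from(byte >> 4)];
        if c != 0 {
            // Bit k being set means pattern k's fingerprint may occur at
            // offset i; verification must confirm the full pattern.
            println!("candidate bitset {:08b} at offset {}", c, i);
        }
    }
}
```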
||||
|
||||
Once we have that, we can look for the position of the least significant bit
|
||||
in `C`. (Least significant because we only target little endian here. Thus,
|
||||
the least significant bytes correspond to bytes in our haystack at a lower
|
||||
address.) That position, modulo `8`, gives us the pattern that the fingerprint
|
||||
matches. That position, integer divided by `8`, also gives us the byte offset
|
||||
that the fingerprint occurs in inside the 16 byte haystack block. Using those
|
||||
two pieces of information, we can run a verification procedure that tries
|
||||
to match all substrings containing that fingerprint at that position in the
|
||||
haystack.
|
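Concretely, here is a sketch of that extraction, treating the 16 bitset bytes
of `C` as a single little-endian `u128` (a hypothetical packing, purely for
illustration):

```ignore
// `c` holds the 16 bitset bytes of C, packed little endian.
while c != 0 {
    let lsb = c.trailing_zeros() as usize;
    let pattern = lsb % 8; // which pattern's fingerprint matched
    let offset = lsb / 8; // byte offset within the 16 byte block
    // ... verify `pattern` against the haystack at `offset` ...
    c &= c - 1; // clear that bit and move on to the next candidate
}
```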
||||
|
||||
|
||||
# Implementation notes
|
||||
|
||||
The problem with the algorithm as described above is that it uses a single byte
|
||||
for a fingerprint. This will work well if the fingerprints are rare in the
|
||||
haystack (e.g., capital letters or special characters in normal English text),
|
||||
but if the fingerprints are common, you'll wind up spending too much time in
|
||||
the verification step, which effectively negates the performance benefits of
|
||||
scanning 16 bytes at a time. Remember, the key to the performance of this
|
||||
algorithm is to do as little work as possible per 16 (or 32) bytes.
|
||||
|
||||
This algorithm can be extrapolated in a relatively straightforward way to use
|
||||
larger fingerprints. That is, instead of a single byte prefix, we might use a
|
||||
two or three byte prefix. The implementation here implements N = {1, 2, 3}
|
||||
and always picks the largest N possible. The rationale is that the bigger the
|
||||
fingerprint, the fewer verification steps we'll do. Of course, if N is too
|
||||
large, then we'll end up doing too much on each step.
|
||||
|
||||
The way to extend it is:
|
||||
|
||||
1. Add a mask for each byte in the fingerprint. (Remember that each mask is
|
||||
composed of two SIMD vectors.) This results in a value of `C` for each byte
|
||||
in the fingerprint while searching.
|
||||
2. When testing each 16 (or 32) byte block, each value of `C` must be shifted
|
||||
so that they are aligned. Once aligned, they should all be `AND`'d together.
|
||||
This will give you only the bitsets corresponding to the full match of the
|
||||
fingerprint. To do this, one needs to save the last byte (for N=2) or last
|
||||
two bytes (for N=3) from the previous iteration, and then line them up with
|
||||
the first one or two bytes of the next iteration, as sketched below.
|
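For example, here is a scalar sketch of the N = 2 alignment with hypothetical
helper names (not the vendored implementation): `res0` and `res1` are the mask
lookup results for the first and second fingerprint byte of the current chunk,
and `prev0` is `res0` saved from the previous chunk.

```ignore
// A fingerprint ending at offset i has its first byte at offset i - 1,
// so res0 is shifted forward by one byte, with the carry-in taken from
// the last byte of the previous chunk (PALIGNR does this in the real
// code). ANDing then keeps only full two byte fingerprint matches.
let res0_aligned = shift_forward_one_byte(prev0, res0);
let c = res0_aligned & res1;
prev0 = res0;
```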
||||
|
||||
## Verification
|
||||
|
||||
Verification generally follows the procedure outlined above. The tricky parts
|
||||
are in the right formulation of operations to get our bits out of our vectors.
|
||||
We have a limited set of operations available to us on SIMD vectors as 128-bit
|
||||
or 256-bit numbers, so we wind up needing to rip out 2 (or 4) 64-bit integers
|
||||
from our vectors, and then run our verification step on each of those. The
|
||||
verification step looks at the least significant bit set, and from its
|
||||
position, we can derive the byte offset and bucket. (Again, as described
|
||||
above.) Once we know the bucket, we do a fairly naive exhaustive search for
|
||||
every literal in that bucket. (Hyperscan is a bit smarter here and uses a hash
|
||||
table, but I haven't had time to thoroughly explore that. A few initial
|
||||
half-hearted attempts resulted in worse performance.)
|
||||
|
||||
## AVX
|
||||
|
||||
The AVX version of Teddy extrapolates almost perfectly from the SSE version.
|
||||
The only hiccup is that PALIGNR is used to align chunks in the 128-bit version,
|
||||
and there is no equivalent instruction in AVX. AVX does have VPALIGNR, but it
|
||||
only works within 128-bit lanes. So there's a bit of tomfoolery to get around
|
||||
this by shuffling the vectors before calling VPALIGNR.
|
||||
|
||||
The only other aspect to AVX is that since our masks are still fundamentally
|
||||
16 bytes (0x0-0xF), they are duplicated to 32 bytes, so that they can apply to
|
||||
32-byte chunks.
|
||||
|
||||
## Fat Teddy
|
||||
|
||||
In the version of Teddy described above, 8 buckets are used to group patterns
|
||||
that we want to search for. However, when AVX is available, we can extend the
|
||||
number of buckets to 16 by permitting each byte in our masks to use 16 bits
|
||||
instead of 8 bits to represent the buckets it belongs to. (This variant is also
|
||||
in Hyperscan.) However, what we give up is the ability to scan 32 bytes at a
|
||||
time, even though we're using AVX. Instead, we have to scan 16 bytes at a time.
|
||||
What we gain, though, is (hopefully) less work in our verification routine.
|
||||
If patterns are spread out across more buckets, then there should overall
|
||||
be fewer false positives. In general, Fat Teddy permits us to grow our capacity
|
||||
a bit and search for more literals before Teddy gets overwhelmed.
|
||||
|
||||
The tricky part of Fat Teddy is in how we adjust our masks and our verification
|
||||
procedure. For the masks, we simply represent the first 8 buckets in each of
|
||||
the low 16 bytes, and then the second 8 buckets in each of the high 16 bytes.
|
||||
Then, in the search loop, instead of loading 32 bytes from the haystack, we
|
||||
load the same 16 bytes from the haystack into both the low and high 16 byte
|
||||
portions of our 256-bit vector. So for example, a mask might look like this:
|
||||
|
||||
bits: 00100001 00000000 ... 11000000 00000000 00000001 ... 00000000
|
||||
byte: 31 30 16 15 14 0
|
||||
offset: 15 14 0 15 14 0
|
||||
buckets: 8-15 8-15 8-15 0-7 0-7 0-7
|
||||
|
||||
Where `byte` is the position in the vector (higher numbers corresponding to
|
||||
more significant bits), `offset` is the corresponding position in the haystack
|
||||
chunk, and `buckets` corresponds to the bucket assignments for that particular
|
||||
byte.
|
||||
|
||||
In particular, notice that the bucket assignments for offset `0` are spread
|
||||
out between bytes `0` and `16`. This works well for the chunk-by-chunk search
|
||||
procedure, but verification really wants to process all bucket assignments for
|
||||
each offset at once. Otherwise, we might wind up finding a match at offset
|
||||
`1` in one of the first 8 buckets, when we really should have reported a match
|
||||
at offset `0` in one of the second 8 buckets. (Because we want the leftmost
|
||||
match.)
|
||||
|
||||
Thus, for verification, we rearrange the above vector such that it is a
|
||||
sequence of 16-bit integers, where the least significant 16-bit integer
|
||||
corresponds to all of the bucket assignments for offset `0`. So with the
|
||||
above vector, the least significant 16-bit integer would be
|
||||
|
||||
11000000 00000000
|
||||
|
||||
which was taken from bytes `16` and `0`. Then the verification step pretty much
|
||||
runs as described, except with 16 buckets instead of 8.
|
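As a scalar sketch of that rearrangement (hypothetical layout, purely for
illustration): if `lo` holds the 16 bucket-assignment bytes for buckets 0-7 and
`hi` holds the 16 bytes for buckets 8-15, then interleaving them byte-wise
yields one 16-bit word of bucket assignments per haystack offset.

```ignore
for i in 0..16 {
    // All 16 bucket assignments for haystack offset i; buckets 8-15 end
    // up in the high byte (byte 16 + i of the original 256-bit vector).
    let buckets = (u16::from(hi[i]) << 8) | u16::from(lo[i]);
    // ... scan `buckets` for set bits and verify each bucket's literals.
}
```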
||||
|
||||
|
||||
# References
|
||||
|
||||
- **[1]** [Hyperscan on GitHub](https://github.com/intel/hyperscan),
|
||||
[webpage](https://www.hyperscan.io/)
|
||||
- **[2a]** Ben-Kiki, O., Bille, P., Breslauer, D., Gasieniec, L., Grossi, R.,
|
||||
& Weimann, O. (2011).
|
||||
_Optimal packed string matching_.
|
||||
In LIPIcs-Leibniz International Proceedings in Informatics (Vol. 13).
|
||||
Schloss Dagstuhl-Leibniz-Zentrum fuer Informatik.
|
||||
DOI: 10.4230/LIPIcs.FSTTCS.2011.423.
|
||||
[PDF](https://drops.dagstuhl.de/opus/volltexte/2011/3355/pdf/37.pdf).
|
||||
- **[2b]** Ben-Kiki, O., Bille, P., Breslauer, D., Ga̧sieniec, L., Grossi, R.,
|
||||
& Weimann, O. (2014).
|
||||
_Towards optimal packed string matching_.
|
||||
Theoretical Computer Science, 525, 111-129.
|
||||
DOI: 10.1016/j.tcs.2013.06.013.
|
||||
[PDF](https://www.cs.haifa.ac.il/~oren/Publications/bpsm.pdf).
|
||||
- **[3]** Bille, P. (2011).
|
||||
_Fast searching in packed strings_.
|
||||
Journal of Discrete Algorithms, 9(1), 49-56.
|
||||
DOI: 10.1016/j.jda.2010.09.003.
|
||||
[PDF](https://www.sciencedirect.com/science/article/pii/S1570866710000353).
|
||||
- **[4a]** Faro, S., & Külekci, M. O. (2012, October).
|
||||
_Fast multiple string matching using streaming SIMD extensions technology_.
|
||||
In String Processing and Information Retrieval (pp. 217-228).
|
||||
Springer Berlin Heidelberg.
|
||||
DOI: 10.1007/978-3-642-34109-0_23.
|
||||
[PDF](https://www.dmi.unict.it/faro/papers/conference/faro32.pdf).
|
||||
- **[4b]** Faro, S., & Külekci, M. O. (2013, September).
|
||||
_Towards a Very Fast Multiple String Matching Algorithm for Short Patterns_.
|
||||
In Stringology (pp. 78-91).
|
||||
[PDF](https://www.dmi.unict.it/faro/papers/conference/faro36.pdf).
|
||||
- **[4c]** Faro, S., & Külekci, M. O. (2013, January).
|
||||
_Fast packed string matching for short patterns_.
|
||||
In Proceedings of the Meeting on Algorithm Engineering & Experiments
|
||||
(pp. 113-121).
|
||||
Society for Industrial and Applied Mathematics.
|
||||
[PDF](https://arxiv.org/pdf/1209.6449.pdf).
|
||||
- **[4d]** Faro, S., & Külekci, M. O. (2014).
|
||||
_Fast and flexible packed string matching_.
|
||||
Journal of Discrete Algorithms, 28, 61-72.
|
||||
DOI: 10.1016/j.jda.2014.07.003.
|
||||
|
||||
[1_u]: https://github.com/intel/hyperscan
|
||||
[5_u]: https://software.intel.com/sites/landingpage/IntrinsicsGuide
|
780
vendor/aho-corasick/src/packed/teddy/builder.rs
vendored
Normal file
@ -0,0 +1,780 @@
|
||||
use core::{
|
||||
fmt::Debug,
|
||||
panic::{RefUnwindSafe, UnwindSafe},
|
||||
};
|
||||
|
||||
use alloc::sync::Arc;
|
||||
|
||||
use crate::packed::{ext::Pointer, pattern::Patterns, teddy::generic::Match};
|
||||
|
||||
/// A builder for constructing a Teddy matcher.
|
||||
///
|
||||
/// The builder primarily permits fine grained configuration of the Teddy
|
||||
/// matcher. Most options are made only available for testing/benchmarking
|
||||
/// purposes. In reality, options are automatically determined by the nature
|
||||
/// and number of patterns given to the builder.
|
||||
#[derive(Clone, Debug)]
|
||||
pub(crate) struct Builder {
|
||||
/// When none, this is automatically determined. Otherwise, `false` means
|
||||
/// slim Teddy is used (8 buckets) and `true` means fat Teddy is used
|
||||
/// (16 buckets). Fat Teddy requires AVX2, so if that CPU feature isn't
|
||||
/// available and Fat Teddy was requested, no matcher will be built.
|
||||
only_fat: Option<bool>,
|
||||
/// When none, this is automatically determined. Otherwise, `false` means
|
||||
/// that 128-bit vectors will be used (up to SSSE3 instructions) whereas
|
||||
/// `true` means that 256-bit vectors will be used. As with `fat`, if
|
||||
/// 256-bit vectors are requested and they aren't available, then a
|
||||
/// searcher will not be built.
|
||||
only_256bit: Option<bool>,
|
||||
/// When true (the default), the number of patterns will be used as a
|
||||
/// heuristic for refusing construction of a Teddy searcher. The point here
|
||||
/// is that too many patterns can overwhelm Teddy. But this can be disabled
|
||||
/// in cases where the caller knows better.
|
||||
heuristic_pattern_limits: bool,
|
||||
}
|
||||
|
||||
impl Default for Builder {
|
||||
fn default() -> Builder {
|
||||
Builder::new()
|
||||
}
|
||||
}
|
||||
|
||||
impl Builder {
|
||||
/// Create a new builder for configuring a Teddy matcher.
|
||||
pub(crate) fn new() -> Builder {
|
||||
Builder {
|
||||
only_fat: None,
|
||||
only_256bit: None,
|
||||
heuristic_pattern_limits: true,
|
||||
}
|
||||
}
|
||||
|
||||
/// Build a matcher for the set of patterns given. If a matcher could not
|
||||
/// be built, then `None` is returned.
|
||||
///
|
||||
/// Generally, a matcher isn't built if the necessary CPU features aren't
|
||||
/// available, if the target is unsupported, or if the searcher is believed to be
|
||||
/// slower than standard techniques (i.e., if there are too many literals).
|
||||
pub(crate) fn build(&self, patterns: Arc<Patterns>) -> Option<Searcher> {
|
||||
self.build_imp(patterns)
|
||||
}
|
||||
|
||||
/// Require the use of Fat (true) or Slim (false) Teddy. Fat Teddy uses
|
||||
/// 16 buckets whereas Slim Teddy uses 8 buckets. More buckets are useful
|
||||
/// for a larger set of literals.
|
||||
///
|
||||
/// `None` is the default, which results in an automatic selection based
|
||||
/// on the number of literals and available CPU features.
|
||||
pub(crate) fn only_fat(&mut self, yes: Option<bool>) -> &mut Builder {
|
||||
self.only_fat = yes;
|
||||
self
|
||||
}
|
||||
|
||||
/// Request the use of 256-bit vectors (true) or 128-bit vectors (false).
|
||||
/// Generally, a larger vector size is better since it either permits
|
||||
/// matching more patterns or matching more bytes in the haystack at once.
|
||||
///
|
||||
/// `None` is the default, which results in an automatic selection based on
|
||||
/// the number of literals and available CPU features.
|
||||
pub(crate) fn only_256bit(&mut self, yes: Option<bool>) -> &mut Builder {
|
||||
self.only_256bit = yes;
|
||||
self
|
||||
}
|
||||
|
||||
/// Request that heuristic limitations on the number of patterns be
|
||||
/// employed. This is useful to disable for benchmarking where one wants to
|
||||
/// explore how Teddy performs on a large number of patterns even if the
|
||||
/// heuristics would otherwise refuse construction.
|
||||
///
|
||||
/// This is enabled by default.
|
||||
pub(crate) fn heuristic_pattern_limits(
|
||||
&mut self,
|
||||
yes: bool,
|
||||
) -> &mut Builder {
|
||||
self.heuristic_pattern_limits = yes;
|
||||
self
|
||||
}
|
||||
|
||||
fn build_imp(&self, patterns: Arc<Patterns>) -> Option<Searcher> {
|
||||
let patlimit = self.heuristic_pattern_limits;
|
||||
// There's no particular reason why we limit ourselves to little endian
|
||||
// here, but it seems likely that some parts of Teddy as they are
|
||||
// currently written (e.g., the uses of `trailing_zeros`) are likely
|
||||
// wrong on non-little-endian targets. Such things are likely easy to
|
||||
// fix, but at the time of writing (2023/09/18), I actually do not know
|
||||
// how to test this code on a big-endian target. So for now, we're
|
||||
// conservative and just bail out.
|
||||
if !cfg!(target_endian = "little") {
|
||||
debug!("skipping Teddy because target isn't little endian");
|
||||
return None;
|
||||
}
|
||||
// Too many patterns will overwhelm Teddy and likely lead to
|
||||
// slowdowns, typically in the verification step.
|
||||
if patlimit && patterns.len() > 64 {
|
||||
debug!("skipping Teddy because of too many patterns");
|
||||
return None;
|
||||
}
|
||||
|
||||
#[cfg(all(target_arch = "x86_64", target_feature = "sse2"))]
|
||||
{
|
||||
use self::x86_64::{FatAVX2, SlimAVX2, SlimSSSE3};
|
||||
|
||||
let mask_len = core::cmp::min(4, patterns.minimum_len());
|
||||
let beefy = patterns.len() > 32;
|
||||
let has_avx2 = self::x86_64::is_available_avx2();
|
||||
let has_ssse3 = has_avx2 || self::x86_64::is_available_ssse3();
|
||||
let use_avx2 = if self.only_256bit == Some(true) {
|
||||
if !has_avx2 {
|
||||
debug!(
|
||||
"skipping Teddy because avx2 was demanded but unavailable"
|
||||
);
|
||||
return None;
|
||||
}
|
||||
true
|
||||
} else if self.only_256bit == Some(false) {
|
||||
if !has_ssse3 {
|
||||
debug!(
|
||||
"skipping Teddy because ssse3 was demanded but unavailable"
|
||||
);
|
||||
return None;
|
||||
}
|
||||
false
|
||||
} else if !has_ssse3 && !has_avx2 {
|
||||
debug!(
|
||||
"skipping Teddy because ssse3 and avx2 are unavailable"
|
||||
);
|
||||
return None;
|
||||
} else {
|
||||
has_avx2
|
||||
};
|
||||
let fat = match self.only_fat {
|
||||
None => use_avx2 && beefy,
|
||||
Some(false) => false,
|
||||
Some(true) if !use_avx2 => {
|
||||
debug!(
|
||||
"skipping Teddy because fat was demanded, but fat \
|
||||
Teddy requires avx2 which is unavailable"
|
||||
);
|
||||
return None;
|
||||
}
|
||||
Some(true) => true,
|
||||
};
|
||||
// Just like for aarch64, it's possible that too many patterns will
|
||||
// overwhelm Teddy. Unlike aarch64 though, we have Fat Teddy which
|
||||
// helps things scale a bit more by spreading patterns over more
|
||||
// buckets.
|
||||
//
|
||||
// These thresholds were determined by looking at the measurements
|
||||
// for the rust/aho-corasick/packed/leftmost-first and
|
||||
// rust/aho-corasick/dfa/leftmost-first engines on the `teddy/`
|
||||
// benchmarks.
|
||||
if patlimit && mask_len == 1 && patterns.len() > 16 {
|
||||
debug!(
|
||||
"skipping Teddy (mask len: 1) because there are \
|
||||
too many patterns",
|
||||
);
|
||||
return None;
|
||||
}
|
||||
match (mask_len, use_avx2, fat) {
|
||||
(1, false, _) => {
|
||||
debug!("Teddy choice: 128-bit slim, 1 byte");
|
||||
SlimSSSE3::<1>::new(&patterns)
|
||||
}
|
||||
(1, true, false) => {
|
||||
debug!("Teddy choice: 256-bit slim, 1 byte");
|
||||
SlimAVX2::<1>::new(&patterns)
|
||||
}
|
||||
(1, true, true) => {
|
||||
debug!("Teddy choice: 256-bit fat, 1 byte");
|
||||
FatAVX2::<1>::new(&patterns)
|
||||
}
|
||||
(2, false, _) => {
|
||||
debug!("Teddy choice: 128-bit slim, 2 bytes");
|
||||
SlimSSSE3::<2>::new(&patterns)
|
||||
}
|
||||
(2, true, false) => {
|
||||
debug!("Teddy choice: 256-bit slim, 2 bytes");
|
||||
SlimAVX2::<2>::new(&patterns)
|
||||
}
|
||||
(2, true, true) => {
|
||||
debug!("Teddy choice: 256-bit fat, 2 bytes");
|
||||
FatAVX2::<2>::new(&patterns)
|
||||
}
|
||||
(3, false, _) => {
|
||||
debug!("Teddy choice: 128-bit slim, 3 bytes");
|
||||
SlimSSSE3::<3>::new(&patterns)
|
||||
}
|
||||
(3, true, false) => {
|
||||
debug!("Teddy choice: 256-bit slim, 3 bytes");
|
||||
SlimAVX2::<3>::new(&patterns)
|
||||
}
|
||||
(3, true, true) => {
|
||||
debug!("Teddy choice: 256-bit fat, 3 bytes");
|
||||
FatAVX2::<3>::new(&patterns)
|
||||
}
|
||||
(4, false, _) => {
|
||||
debug!("Teddy choice: 128-bit slim, 4 bytes");
|
||||
SlimSSSE3::<4>::new(&patterns)
|
||||
}
|
||||
(4, true, false) => {
|
||||
debug!("Teddy choice: 256-bit slim, 4 bytes");
|
||||
SlimAVX2::<4>::new(&patterns)
|
||||
}
|
||||
(4, true, true) => {
|
||||
debug!("Teddy choice: 256-bit fat, 4 bytes");
|
||||
FatAVX2::<4>::new(&patterns)
|
||||
}
|
||||
_ => {
|
||||
debug!("no supported Teddy configuration found");
|
||||
None
|
||||
}
|
||||
}
|
||||
}
|
||||
#[cfg(target_arch = "aarch64")]
|
||||
{
|
||||
use self::aarch64::SlimNeon;
|
||||
|
||||
let mask_len = core::cmp::min(4, patterns.minimum_len());
|
||||
if self.only_256bit == Some(true) {
|
||||
debug!(
|
||||
"skipping Teddy because 256-bits were demanded \
|
||||
but unavailable"
|
||||
);
|
||||
return None;
|
||||
}
|
||||
if self.only_fat == Some(true) {
|
||||
debug!(
|
||||
"skipping Teddy because fat was demanded but unavailable"
|
||||
);
|
||||
return None;
}
|
||||
// Since we don't have Fat teddy in aarch64 (I think we'd want at
|
||||
// least 256-bit vectors for that), we need to be careful not to
|
||||
// allow too many patterns as it might overwhelm Teddy. Generally
|
||||
// speaking, as the mask length goes up, the more patterns we can
|
||||
// handle because the mask length results in fewer candidates
|
||||
// generated.
|
||||
//
|
||||
// These thresholds were determined by looking at the measurements
|
||||
// for the rust/aho-corasick/packed/leftmost-first and
|
||||
// rust/aho-corasick/dfa/leftmost-first engines on the `teddy/`
|
||||
// benchmarks.
|
||||
match mask_len {
|
||||
1 => {
|
||||
if patlimit && patterns.len() > 16 {
|
||||
debug!(
|
||||
"skipping Teddy (mask len: 1) because there are \
|
||||
too many patterns",
|
||||
);
|
||||
return None;
}
|
||||
debug!("Teddy choice: 128-bit slim, 1 byte");
|
||||
SlimNeon::<1>::new(&patterns)
|
||||
}
|
||||
2 => {
|
||||
if patlimit && patterns.len() > 32 {
|
||||
debug!(
|
||||
"skipping Teddy (mask len: 2) because there are \
|
||||
too many patterns",
|
||||
);
|
||||
return None;
}
|
||||
debug!("Teddy choice: 128-bit slim, 2 bytes");
|
||||
SlimNeon::<2>::new(&patterns)
|
||||
}
|
||||
3 => {
|
||||
if patlimit && patterns.len() > 48 {
|
||||
debug!(
|
||||
"skipping Teddy (mask len: 3) because there are \
|
||||
too many patterns",
|
||||
);
|
||||
return None;
}
|
||||
debug!("Teddy choice: 128-bit slim, 3 bytes");
|
||||
SlimNeon::<3>::new(&patterns)
|
||||
}
|
||||
4 => {
|
||||
debug!("Teddy choice: 128-bit slim, 4 bytes");
|
||||
SlimNeon::<4>::new(&patterns)
|
||||
}
|
||||
_ => {
|
||||
debug!("no supported Teddy configuration found");
|
||||
None
|
||||
}
|
||||
}
|
||||
}
|
||||
#[cfg(not(any(
|
||||
all(target_arch = "x86_64", target_feature = "sse2"),
|
||||
target_arch = "aarch64"
|
||||
)))]
|
||||
{
|
||||
None
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/// A searcher that dispatches to one of several possible Teddy variants.
|
||||
#[derive(Clone, Debug)]
|
||||
pub(crate) struct Searcher {
|
||||
/// The Teddy variant we use. We use dynamic dispatch under the theory that
|
||||
/// it results in better codegen than an enum, although this is a specious
|
||||
/// claim.
|
||||
///
|
||||
/// This `Searcher` is essentially a wrapper for a `SearcherT` trait
|
||||
/// object. We just make `memory_usage` and `minimum_len` available without
|
||||
/// going through dynamic dispatch.
|
||||
imp: Arc<dyn SearcherT>,
|
||||
/// Total heap memory used by the Teddy variant.
|
||||
memory_usage: usize,
|
||||
/// The minimum haystack length this searcher can handle. It is intended
|
||||
/// for callers to use some other search routine (such as Rabin-Karp) in
|
||||
/// cases where the haystack (or remainder of the haystack) is too short.
|
||||
minimum_len: usize,
|
||||
}
|
||||
|
||||
impl Searcher {
|
||||
/// Look for the leftmost occurrence of any pattern in this search in the
|
||||
/// given haystack starting at the given position.
|
||||
///
|
||||
/// # Panics
|
||||
///
|
||||
/// This panics when `haystack[at..].len()` is less than the minimum length
|
||||
/// for this haystack.
|
||||
#[inline(always)]
|
||||
pub(crate) fn find(
|
||||
&self,
|
||||
haystack: &[u8],
|
||||
at: usize,
|
||||
) -> Option<crate::Match> {
|
||||
// SAFETY: The Teddy implementations all require a minimum haystack
|
||||
// length, and this is required for safety. Therefore, we assert it
|
||||
// here in order to make this method sound.
|
||||
assert!(haystack[at..].len() >= self.minimum_len);
|
||||
let hayptr = haystack.as_ptr();
|
||||
// SAFETY: Construction of the searcher guarantees that we are able
|
||||
// to run it in the current environment (i.e., we won't get an AVX2
|
||||
// searcher on a x86-64 CPU without AVX2 support). Also, the pointers
|
||||
// are valid as they are derived directly from a borrowed slice.
|
||||
let teddym = unsafe {
|
||||
self.imp.find(hayptr.add(at), hayptr.add(haystack.len()))?
|
||||
};
|
||||
let start = teddym.start().as_usize().wrapping_sub(hayptr.as_usize());
|
||||
let end = teddym.end().as_usize().wrapping_sub(hayptr.as_usize());
|
||||
let span = crate::Span { start, end };
|
||||
// OK because we won't permit the construction of a searcher that
|
||||
// could report a pattern ID bigger than what can fit in the crate-wide
|
||||
// PatternID type.
|
||||
let pid = crate::PatternID::new_unchecked(teddym.pattern().as_usize());
|
||||
let m = crate::Match::new(pid, span);
|
||||
Some(m)
|
||||
}

    /// Returns the approximate total amount of heap used by this type, in
    /// units of bytes.
    #[inline(always)]
    pub(crate) fn memory_usage(&self) -> usize {
        self.memory_usage
    }

    /// Returns the minimum length, in bytes, that a haystack must be in order
    /// to use it with this searcher.
    #[inline(always)]
    pub(crate) fn minimum_len(&self) -> usize {
        self.minimum_len
    }
}

/// A trait that provides dynamic dispatch over the different possible Teddy
/// variants on the same algorithm.
///
/// On `x86_64`, for example, it isn't known until runtime which of 12 possible
/// variants will be used. One might use one of the four slim 128-bit vector
/// variants, one of the four slim 256-bit vector variants, or one of the four
/// fat 256-bit vector variants.
///
/// Since this choice is generally made when the Teddy searcher is constructed
/// and this choice is based on the patterns given and what the current CPU
/// supports, it follows that there must be some kind of indirection at search
/// time that "selects" the variant chosen at build time.
///
/// There are a few different ways to go about this. One approach is to use an
/// enum. It works fine, but in my experiments, this generally results in worse
/// codegen. Another approach, which is what we use here, is dynamic dispatch
/// via a trait object. We basically implement this trait for each possible
/// variant, select the variant we want at build time and convert it to a
/// trait object for use at search time.
///
/// Another approach is to use function pointers and stick each of the possible
/// variants into a union. This is essentially isomorphic to the dynamic
/// dispatch approach, but doesn't require any allocations. Since this crate
/// requires `alloc`, there's no real reason (AFAIK) to go down this path. (The
/// `memchr` crate does this.)
trait SearcherT:
    Debug + Send + Sync + UnwindSafe + RefUnwindSafe + 'static
{
    /// Execute a search on the given haystack (identified by `start` and `end`
    /// raw pointers).
    ///
    /// # Safety
    ///
    /// Essentially, the `start` and `end` pointers must be valid and point
    /// to a haystack one can read. As long as you derive them from, for
    /// example, a `&[u8]`, they should automatically satisfy all of the safety
    /// obligations:
    ///
    /// * Both `start` and `end` must be valid for reads.
    /// * Both `start` and `end` must point to an initialized value.
    /// * Both `start` and `end` must point to the same allocated object and
    ///   must either be in bounds or at most one byte past the end of the
    ///   allocated object.
    /// * Both `start` and `end` must be _derived from_ a pointer to the same
    ///   object.
    /// * The distance between `start` and `end` must not overflow `isize`.
    /// * The distance being in bounds must not rely on "wrapping around" the
    ///   address space.
    /// * It must be the case that `start <= end`.
    /// * `end - start` must be greater than the minimum length for this
    ///   searcher.
    ///
    /// Also, it is expected that implementations of this trait will tag this
    /// method with a `target_feature` attribute. Callers must ensure that
    /// they are executing this method in an environment where that attribute
    /// is valid.
    unsafe fn find(&self, start: *const u8, end: *const u8) -> Option<Match>;
}
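
// Editor's note: a minimal, self-contained sketch (not from the vendored
// crate) of the "choose at build time, dispatch through a trait object at
// search time" pattern the docs above describe. The `Variant*` names and the
// `build` function are hypothetical; the real code selects between types like
// SlimSSSE3, SlimAVX2 and FatAVX2.
//
//     use alloc::sync::Arc;
//
//     trait Impl: core::fmt::Debug {
//         fn run(&self) -> &'static str;
//     }
//
//     #[derive(Debug)]
//     struct Variant128;
//     #[derive(Debug)]
//     struct Variant256;
//
//     impl Impl for Variant128 {
//         fn run(&self) -> &'static str { "128-bit path" }
//     }
//     impl Impl for Variant256 {
//         fn run(&self) -> &'static str { "256-bit path" }
//     }
//
//     fn build(avx2_available: bool) -> Arc<dyn Impl> {
//         // The branch runs once at construction; every search afterwards
//         // pays only a virtual call, not a repeated feature check.
//         if avx2_available {
//             Arc::new(Variant256)
//         } else {
//             Arc::new(Variant128)
//         }
//     }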

#[cfg(all(target_arch = "x86_64", target_feature = "sse2"))]
mod x86_64 {
    use core::arch::x86_64::{__m128i, __m256i};

    use alloc::sync::Arc;

    use crate::packed::{
        ext::Pointer,
        pattern::Patterns,
        teddy::generic::{self, Match},
    };

    use super::{Searcher, SearcherT};

    #[derive(Clone, Debug)]
    pub(super) struct SlimSSSE3<const BYTES: usize> {
        slim128: generic::Slim<__m128i, BYTES>,
    }

    // Defines SlimSSSE3 wrapper functions for 1, 2, 3 and 4 bytes.
    macro_rules! slim_ssse3 {
        ($len:expr) => {
            impl SlimSSSE3<$len> {
                /// Creates a new searcher using "slim" Teddy with 128-bit
                /// vectors. If SSSE3 is not available in the current
                /// environment, then this returns `None`.
                pub(super) fn new(
                    patterns: &Arc<Patterns>,
                ) -> Option<Searcher> {
                    if !is_available_ssse3() {
                        return None;
                    }
                    Some(unsafe { SlimSSSE3::<$len>::new_unchecked(patterns) })
                }

                /// Creates a new searcher using "slim" Teddy with 128-bit
                /// vectors without checking whether SSSE3 is available or not.
                ///
                /// # Safety
                ///
                /// Callers must ensure that SSSE3 is available in the current
                /// environment.
                #[target_feature(enable = "ssse3")]
                unsafe fn new_unchecked(patterns: &Arc<Patterns>) -> Searcher {
                    let slim128 = generic::Slim::<__m128i, $len>::new(
                        Arc::clone(patterns),
                    );
                    let memory_usage = slim128.memory_usage();
                    let minimum_len = slim128.minimum_len();
                    let imp = Arc::new(SlimSSSE3 { slim128 });
                    Searcher { imp, memory_usage, minimum_len }
                }
            }

            impl SearcherT for SlimSSSE3<$len> {
                #[target_feature(enable = "ssse3")]
                #[inline]
                unsafe fn find(
                    &self,
                    start: *const u8,
                    end: *const u8,
                ) -> Option<Match> {
                    // SAFETY: All obligations except for `target_feature` are
                    // passed to the caller. Our use of `target_feature` is
                    // safe because construction of this type requires that the
                    // requisite target features are available.
                    self.slim128.find(start, end)
                }
            }
        };
    }

    slim_ssse3!(1);
    slim_ssse3!(2);
    slim_ssse3!(3);
    slim_ssse3!(4);

    #[derive(Clone, Debug)]
    pub(super) struct SlimAVX2<const BYTES: usize> {
        slim128: generic::Slim<__m128i, BYTES>,
        slim256: generic::Slim<__m256i, BYTES>,
    }

    // Defines SlimAVX2 wrapper functions for 1, 2, 3 and 4 bytes.
    macro_rules! slim_avx2 {
        ($len:expr) => {
            impl SlimAVX2<$len> {
                /// Creates a new searcher using "slim" Teddy with 256-bit
                /// vectors. If AVX2 is not available in the current
                /// environment, then this returns `None`.
                pub(super) fn new(
                    patterns: &Arc<Patterns>,
                ) -> Option<Searcher> {
                    if !is_available_avx2() {
                        return None;
                    }
                    Some(unsafe { SlimAVX2::<$len>::new_unchecked(patterns) })
                }

                /// Creates a new searcher using "slim" Teddy with 256-bit
                /// vectors without checking whether AVX2 is available or not.
                ///
                /// # Safety
                ///
                /// Callers must ensure that AVX2 is available in the current
                /// environment.
                #[target_feature(enable = "avx2")]
                unsafe fn new_unchecked(patterns: &Arc<Patterns>) -> Searcher {
                    let slim128 = generic::Slim::<__m128i, $len>::new(
                        Arc::clone(&patterns),
                    );
                    let slim256 = generic::Slim::<__m256i, $len>::new(
                        Arc::clone(&patterns),
                    );
                    let memory_usage =
                        slim128.memory_usage() + slim256.memory_usage();
                    let minimum_len = slim128.minimum_len();
                    let imp = Arc::new(SlimAVX2 { slim128, slim256 });
                    Searcher { imp, memory_usage, minimum_len }
                }
            }

            impl SearcherT for SlimAVX2<$len> {
                #[target_feature(enable = "avx2")]
                #[inline]
                unsafe fn find(
                    &self,
                    start: *const u8,
                    end: *const u8,
                ) -> Option<Match> {
                    // SAFETY: All obligations except for `target_feature` are
                    // passed to the caller. Our use of `target_feature` is
                    // safe because construction of this type requires that the
                    // requisite target features are available.
                    let len = end.distance(start);
                    if len < self.slim256.minimum_len() {
                        self.slim128.find(start, end)
                    } else {
                        self.slim256.find(start, end)
                    }
                }
            }
        };
    }

    slim_avx2!(1);
    slim_avx2!(2);
    slim_avx2!(3);
    slim_avx2!(4);

    #[derive(Clone, Debug)]
    pub(super) struct FatAVX2<const BYTES: usize> {
        fat256: generic::Fat<__m256i, BYTES>,
    }

    // Defines FatAVX2 wrapper functions for 1, 2, 3 and 4 bytes.
    macro_rules! fat_avx2 {
        ($len:expr) => {
            impl FatAVX2<$len> {
                /// Creates a new searcher using "fat" Teddy with 256-bit
                /// vectors. If AVX2 is not available in the current
                /// environment, then this returns `None`.
                pub(super) fn new(
                    patterns: &Arc<Patterns>,
                ) -> Option<Searcher> {
                    if !is_available_avx2() {
                        return None;
                    }
                    Some(unsafe { FatAVX2::<$len>::new_unchecked(patterns) })
                }

                /// Creates a new searcher using "fat" Teddy with 256-bit
                /// vectors without checking whether AVX2 is available or not.
                ///
                /// # Safety
                ///
                /// Callers must ensure that AVX2 is available in the current
                /// environment.
                #[target_feature(enable = "avx2")]
                unsafe fn new_unchecked(patterns: &Arc<Patterns>) -> Searcher {
                    let fat256 = generic::Fat::<__m256i, $len>::new(
                        Arc::clone(&patterns),
                    );
                    let memory_usage = fat256.memory_usage();
                    let minimum_len = fat256.minimum_len();
                    let imp = Arc::new(FatAVX2 { fat256 });
                    Searcher { imp, memory_usage, minimum_len }
                }
            }

            impl SearcherT for FatAVX2<$len> {
                #[target_feature(enable = "avx2")]
                #[inline]
                unsafe fn find(
                    &self,
                    start: *const u8,
                    end: *const u8,
                ) -> Option<Match> {
                    // SAFETY: All obligations except for `target_feature` are
                    // passed to the caller. Our use of `target_feature` is
                    // safe because construction of this type requires that the
                    // requisite target features are available.
                    self.fat256.find(start, end)
                }
            }
        };
    }

    fat_avx2!(1);
    fat_avx2!(2);
    fat_avx2!(3);
    fat_avx2!(4);

    #[inline]
    pub(super) fn is_available_ssse3() -> bool {
        #[cfg(not(target_feature = "sse2"))]
        {
            false
        }
        #[cfg(target_feature = "sse2")]
        {
            #[cfg(target_feature = "ssse3")]
            {
                true
            }
            #[cfg(not(target_feature = "ssse3"))]
            {
                #[cfg(feature = "std")]
                {
                    std::is_x86_feature_detected!("ssse3")
                }
                #[cfg(not(feature = "std"))]
                {
                    false
                }
            }
        }
    }

    #[inline]
    pub(super) fn is_available_avx2() -> bool {
        #[cfg(not(target_feature = "sse2"))]
        {
            false
        }
        #[cfg(target_feature = "sse2")]
        {
            #[cfg(target_feature = "avx2")]
            {
                true
            }
            #[cfg(not(target_feature = "avx2"))]
            {
                #[cfg(feature = "std")]
                {
                    std::is_x86_feature_detected!("avx2")
                }
                #[cfg(not(feature = "std"))]
                {
                    false
                }
            }
        }
    }
}
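
// Editor's note: a hedged sketch (not from the vendored crate) of how a
// builder might pick among the wrappers above. Each `new` returns
// `Option<Searcher>` (`None` when the required CPU feature is missing), so
// construction can simply try the preferred variants in order. The function
// name `choose` and the fixed `BYTES = 1` parameter are illustrative only;
// the real builder also weighs the patterns themselves.
//
//     fn choose(patterns: &alloc::sync::Arc<Patterns>) -> Option<Searcher> {
//         // Prefer fat 256-bit Teddy, then slim 256-bit, then slim 128-bit.
//         FatAVX2::<1>::new(patterns)
//             .or_else(|| SlimAVX2::<1>::new(patterns))
//             .or_else(|| SlimSSSE3::<1>::new(patterns))
//     }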

#[cfg(target_arch = "aarch64")]
mod aarch64 {
    use core::arch::aarch64::uint8x16_t;

    use alloc::sync::Arc;

    use crate::packed::{
        pattern::Patterns,
        teddy::generic::{self, Match},
    };

    use super::{Searcher, SearcherT};

    #[derive(Clone, Debug)]
    pub(super) struct SlimNeon<const BYTES: usize> {
        slim128: generic::Slim<uint8x16_t, BYTES>,
    }

    // Defines SlimNeon wrapper functions for 1, 2, 3 and 4 bytes.
    macro_rules! slim_neon {
        ($len:expr) => {
            impl SlimNeon<$len> {
                /// Creates a new searcher using "slim" Teddy with 128-bit
                /// vectors. (NEON is always available on aarch64, so unlike
                /// the x86-64 constructors, this never returns `None`.)
                pub(super) fn new(
                    patterns: &Arc<Patterns>,
                ) -> Option<Searcher> {
                    Some(unsafe { SlimNeon::<$len>::new_unchecked(patterns) })
                }

                /// Creates a new searcher using "slim" Teddy with 128-bit
                /// vectors without checking whether NEON is available or not.
                ///
                /// # Safety
                ///
                /// Callers must ensure that NEON is available in the current
                /// environment.
                #[target_feature(enable = "neon")]
                unsafe fn new_unchecked(patterns: &Arc<Patterns>) -> Searcher {
                    let slim128 = generic::Slim::<uint8x16_t, $len>::new(
                        Arc::clone(patterns),
                    );
                    let memory_usage = slim128.memory_usage();
                    let minimum_len = slim128.minimum_len();
                    let imp = Arc::new(SlimNeon { slim128 });
                    Searcher { imp, memory_usage, minimum_len }
                }
            }

            impl SearcherT for SlimNeon<$len> {
                #[target_feature(enable = "neon")]
                #[inline]
                unsafe fn find(
                    &self,
                    start: *const u8,
                    end: *const u8,
                ) -> Option<Match> {
                    // SAFETY: All obligations except for `target_feature` are
                    // passed to the caller. Our use of `target_feature` is
                    // safe because construction of this type requires that the
                    // requisite target features are available.
                    self.slim128.find(start, end)
                }
            }
        };
    }

    slim_neon!(1);
    slim_neon!(2);
    slim_neon!(3);
    slim_neon!(4);
}
1382 vendor/aho-corasick/src/packed/teddy/generic.rs vendored Normal file
File diff suppressed because it is too large
9 vendor/aho-corasick/src/packed/teddy/mod.rs vendored Normal file
@ -0,0 +1,9 @@
// Regrettable, but Teddy stuff just isn't used on all targets. And for some
// targets, like aarch64, only "slim" Teddy is used and so "fat" Teddy gets a
// bunch of dead-code warnings. Just not worth trying to squash them. Blech.
#![allow(dead_code)]

pub(crate) use self::builder::{Builder, Searcher};

mod builder;
mod generic;
583 vendor/aho-corasick/src/packed/tests.rs vendored Normal file
@ -0,0 +1,583 @@
use std::collections::HashMap;

use alloc::{
    format,
    string::{String, ToString},
    vec,
    vec::Vec,
};

use crate::{
    packed::{Config, MatchKind},
    util::search::Match,
};

/// A description of a single test against a multi-pattern searcher.
///
/// A single test may not necessarily pass on every configuration of a
/// searcher. The tests are categorized and grouped appropriately below.
#[derive(Clone, Debug, Eq, PartialEq)]
struct SearchTest {
    /// The name of this test, for debugging.
    name: &'static str,
    /// The patterns to search for.
    patterns: &'static [&'static str],
    /// The text to search.
    haystack: &'static str,
    /// Each match is a triple of (pattern_index, start, end), where
    /// pattern_index is an index into `patterns` and `start`/`end` are indices
    /// into `haystack`.
    matches: &'static [(usize, usize, usize)],
}
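
// Editor's note (illustrative, not part of the vendored file): reading the
// triple format documented above, a hypothetical entry such as
//
//     t!(example, &["foo", "bar"], "xxbar", &[(1, 2, 5)])
//
// asserts that pattern index 1 ("bar") matches haystack[2..5]. The `t!`
// macro used here is the shorthand constructor defined below.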

struct SearchTestOwned {
    offset: usize,
    name: String,
    patterns: Vec<String>,
    haystack: String,
    matches: Vec<(usize, usize, usize)>,
}

impl SearchTest {
    fn variations(&self) -> Vec<SearchTestOwned> {
        let count = if cfg!(miri) { 1 } else { 261 };
        let mut tests = vec![];
        for i in 0..count {
            tests.push(self.offset_prefix(i));
            tests.push(self.offset_suffix(i));
            tests.push(self.offset_both(i));
        }
        tests
    }

    fn offset_both(&self, off: usize) -> SearchTestOwned {
        SearchTestOwned {
            offset: off,
            name: self.name.to_string(),
            patterns: self.patterns.iter().map(|s| s.to_string()).collect(),
            haystack: format!(
                "{}{}{}",
                "Z".repeat(off),
                self.haystack,
                "Z".repeat(off)
            ),
            matches: self
                .matches
                .iter()
                .map(|&(id, s, e)| (id, s + off, e + off))
                .collect(),
        }
    }

    fn offset_prefix(&self, off: usize) -> SearchTestOwned {
        SearchTestOwned {
            offset: off,
            name: self.name.to_string(),
            patterns: self.patterns.iter().map(|s| s.to_string()).collect(),
            haystack: format!("{}{}", "Z".repeat(off), self.haystack),
            matches: self
                .matches
                .iter()
                .map(|&(id, s, e)| (id, s + off, e + off))
                .collect(),
        }
    }

    fn offset_suffix(&self, off: usize) -> SearchTestOwned {
        SearchTestOwned {
            offset: off,
            name: self.name.to_string(),
            patterns: self.patterns.iter().map(|s| s.to_string()).collect(),
            haystack: format!("{}{}", self.haystack, "Z".repeat(off)),
            matches: self.matches.to_vec(),
        }
    }
}

/// Short-hand constructor for SearchTest. We use it a lot below.
macro_rules! t {
    ($name:ident, $patterns:expr, $haystack:expr, $matches:expr) => {
        SearchTest {
            name: stringify!($name),
            patterns: $patterns,
            haystack: $haystack,
            matches: $matches,
        }
    };
}

/// A collection of test groups.
type TestCollection = &'static [&'static [SearchTest]];

// Define several collections corresponding to the different types of match
// semantics supported. These collections have some overlap, but each
// collection should have some tests that no other collection has.

/// Tests for leftmost-first match semantics.
const PACKED_LEFTMOST_FIRST: TestCollection =
    &[BASICS, LEFTMOST, LEFTMOST_FIRST, REGRESSION, TEDDY];

/// Tests for leftmost-longest match semantics.
const PACKED_LEFTMOST_LONGEST: TestCollection =
    &[BASICS, LEFTMOST, LEFTMOST_LONGEST, REGRESSION, TEDDY];

// Now define the individual tests that make up the collections above.

/// A collection of tests that should always pass regardless of match
/// semantics. That is, all combinations of leftmost-{first, longest}
/// should produce the same answer.
const BASICS: &'static [SearchTest] = &[
    t!(basic001, &["a"], "", &[]),
    t!(basic010, &["a"], "a", &[(0, 0, 1)]),
    t!(basic020, &["a"], "aa", &[(0, 0, 1), (0, 1, 2)]),
    t!(basic030, &["a"], "aaa", &[(0, 0, 1), (0, 1, 2), (0, 2, 3)]),
    t!(basic040, &["a"], "aba", &[(0, 0, 1), (0, 2, 3)]),
    t!(basic050, &["a"], "bba", &[(0, 2, 3)]),
    t!(basic060, &["a"], "bbb", &[]),
    t!(basic070, &["a"], "bababbbba", &[(0, 1, 2), (0, 3, 4), (0, 8, 9)]),
    t!(basic100, &["aa"], "", &[]),
    t!(basic110, &["aa"], "aa", &[(0, 0, 2)]),
    t!(basic120, &["aa"], "aabbaa", &[(0, 0, 2), (0, 4, 6)]),
    t!(basic130, &["aa"], "abbab", &[]),
    t!(basic140, &["aa"], "abbabaa", &[(0, 5, 7)]),
    t!(basic150, &["aaa"], "aaa", &[(0, 0, 3)]),
    t!(basic200, &["abc"], "abc", &[(0, 0, 3)]),
    t!(basic210, &["abc"], "zazabzabcz", &[(0, 6, 9)]),
    t!(basic220, &["abc"], "zazabczabcz", &[(0, 3, 6), (0, 7, 10)]),
    t!(basic230, &["abcd"], "abcd", &[(0, 0, 4)]),
    t!(basic240, &["abcd"], "zazabzabcdz", &[(0, 6, 10)]),
    t!(basic250, &["abcd"], "zazabcdzabcdz", &[(0, 3, 7), (0, 8, 12)]),
    t!(basic300, &["a", "b"], "", &[]),
    t!(basic310, &["a", "b"], "z", &[]),
    t!(basic320, &["a", "b"], "b", &[(1, 0, 1)]),
    t!(basic330, &["a", "b"], "a", &[(0, 0, 1)]),
    t!(
        basic340,
        &["a", "b"],
        "abba",
        &[(0, 0, 1), (1, 1, 2), (1, 2, 3), (0, 3, 4),]
    ),
    t!(
        basic350,
        &["b", "a"],
        "abba",
        &[(1, 0, 1), (0, 1, 2), (0, 2, 3), (1, 3, 4),]
    ),
    t!(basic360, &["abc", "bc"], "xbc", &[(1, 1, 3),]),
    t!(basic400, &["foo", "bar"], "", &[]),
    t!(basic410, &["foo", "bar"], "foobar", &[(0, 0, 3), (1, 3, 6),]),
    t!(basic420, &["foo", "bar"], "barfoo", &[(1, 0, 3), (0, 3, 6),]),
    t!(basic430, &["foo", "bar"], "foofoo", &[(0, 0, 3), (0, 3, 6),]),
    t!(basic440, &["foo", "bar"], "barbar", &[(1, 0, 3), (1, 3, 6),]),
    t!(basic450, &["foo", "bar"], "bafofoo", &[(0, 4, 7),]),
    t!(basic460, &["bar", "foo"], "bafofoo", &[(1, 4, 7),]),
    t!(basic470, &["foo", "bar"], "fobabar", &[(1, 4, 7),]),
    t!(basic480, &["bar", "foo"], "fobabar", &[(0, 4, 7),]),
    t!(basic700, &["yabcdef", "abcdezghi"], "yabcdefghi", &[(0, 0, 7),]),
    t!(basic710, &["yabcdef", "abcdezghi"], "yabcdezghi", &[(1, 1, 10),]),
    t!(
        basic720,
        &["yabcdef", "bcdeyabc", "abcdezghi"],
        "yabcdezghi",
        &[(2, 1, 10),]
    ),
    t!(basic810, &["abcd", "bcd", "cd"], "abcd", &[(0, 0, 4),]),
    t!(basic820, &["bcd", "cd", "abcd"], "abcd", &[(2, 0, 4),]),
    t!(basic830, &["abc", "bc"], "zazabcz", &[(0, 3, 6),]),
    t!(
        basic840,
        &["ab", "ba"],
        "abababa",
        &[(0, 0, 2), (0, 2, 4), (0, 4, 6),]
    ),
    t!(basic850, &["foo", "foo"], "foobarfoo", &[(0, 0, 3), (0, 6, 9),]),
];

/// Tests for leftmost match semantics. These should pass for both
/// leftmost-first and leftmost-longest match kinds. Stated differently, among
/// ambiguous matches, the longest match and the match that appeared first when
/// constructing the automaton should always be the same.
const LEFTMOST: &'static [SearchTest] = &[
    t!(leftmost000, &["ab", "ab"], "abcd", &[(0, 0, 2)]),
    t!(leftmost030, &["a", "ab"], "aa", &[(0, 0, 1), (0, 1, 2)]),
    t!(leftmost031, &["ab", "a"], "aa", &[(1, 0, 1), (1, 1, 2)]),
    t!(leftmost032, &["ab", "a"], "xayabbbz", &[(1, 1, 2), (0, 3, 5)]),
    t!(leftmost300, &["abcd", "bce", "b"], "abce", &[(1, 1, 4)]),
    t!(leftmost310, &["abcd", "ce", "bc"], "abce", &[(2, 1, 3)]),
    t!(leftmost320, &["abcd", "bce", "ce", "b"], "abce", &[(1, 1, 4)]),
    t!(leftmost330, &["abcd", "bce", "cz", "bc"], "abcz", &[(3, 1, 3)]),
    t!(leftmost340, &["bce", "cz", "bc"], "bcz", &[(2, 0, 2)]),
    t!(leftmost350, &["abc", "bd", "ab"], "abd", &[(2, 0, 2)]),
    t!(
        leftmost360,
        &["abcdefghi", "hz", "abcdefgh"],
        "abcdefghz",
        &[(2, 0, 8),]
    ),
    t!(
        leftmost370,
        &["abcdefghi", "cde", "hz", "abcdefgh"],
        "abcdefghz",
        &[(3, 0, 8),]
    ),
    t!(
        leftmost380,
        &["abcdefghi", "hz", "abcdefgh", "a"],
        "abcdefghz",
        &[(2, 0, 8),]
    ),
    t!(
        leftmost390,
        &["b", "abcdefghi", "hz", "abcdefgh"],
        "abcdefghz",
        &[(3, 0, 8),]
    ),
    t!(
        leftmost400,
        &["h", "abcdefghi", "hz", "abcdefgh"],
        "abcdefghz",
        &[(3, 0, 8),]
    ),
    t!(
        leftmost410,
        &["z", "abcdefghi", "hz", "abcdefgh"],
        "abcdefghz",
        &[(3, 0, 8), (0, 8, 9),]
    ),
];

/// Tests for non-overlapping leftmost-first match semantics. These tests
/// should generally be specific to leftmost-first, which means they should
/// generally fail under leftmost-longest semantics.
const LEFTMOST_FIRST: &'static [SearchTest] = &[
    t!(leftfirst000, &["ab", "abcd"], "abcd", &[(0, 0, 2)]),
    t!(leftfirst020, &["abcd", "ab"], "abcd", &[(0, 0, 4)]),
    t!(leftfirst030, &["ab", "ab"], "abcd", &[(0, 0, 2)]),
    t!(leftfirst040, &["a", "ab"], "xayabbbz", &[(0, 1, 2), (0, 3, 4)]),
    t!(leftfirst100, &["abcdefg", "bcde", "bcdef"], "abcdef", &[(1, 1, 5)]),
    t!(leftfirst110, &["abcdefg", "bcdef", "bcde"], "abcdef", &[(1, 1, 6)]),
    t!(leftfirst300, &["abcd", "b", "bce"], "abce", &[(1, 1, 2)]),
    t!(
        leftfirst310,
        &["abcd", "b", "bce", "ce"],
        "abce",
        &[(1, 1, 2), (3, 2, 4),]
    ),
    t!(
        leftfirst320,
        &["a", "abcdefghi", "hz", "abcdefgh"],
        "abcdefghz",
        &[(0, 0, 1), (2, 7, 9),]
    ),
    t!(leftfirst330, &["a", "abab"], "abab", &[(0, 0, 1), (0, 2, 3)]),
    t!(
        leftfirst340,
        &["abcdef", "x", "x", "x", "x", "x", "x", "abcde"],
        "abcdef",
        &[(0, 0, 6)]
    ),
];

/// Tests for non-overlapping leftmost-longest match semantics. These tests
/// should generally be specific to leftmost-longest, which means they should
/// generally fail under leftmost-first semantics.
const LEFTMOST_LONGEST: &'static [SearchTest] = &[
    t!(leftlong000, &["ab", "abcd"], "abcd", &[(1, 0, 4)]),
    t!(leftlong010, &["abcd", "bcd", "cd", "b"], "abcd", &[(0, 0, 4),]),
    t!(leftlong040, &["a", "ab"], "a", &[(0, 0, 1)]),
    t!(leftlong050, &["a", "ab"], "ab", &[(1, 0, 2)]),
    t!(leftlong060, &["ab", "a"], "a", &[(1, 0, 1)]),
    t!(leftlong070, &["ab", "a"], "ab", &[(0, 0, 2)]),
    t!(leftlong100, &["abcdefg", "bcde", "bcdef"], "abcdef", &[(2, 1, 6)]),
    t!(leftlong110, &["abcdefg", "bcdef", "bcde"], "abcdef", &[(1, 1, 6)]),
    t!(leftlong300, &["abcd", "b", "bce"], "abce", &[(2, 1, 4)]),
    t!(
        leftlong310,
        &["a", "abcdefghi", "hz", "abcdefgh"],
        "abcdefghz",
        &[(3, 0, 8),]
    ),
    t!(leftlong320, &["a", "abab"], "abab", &[(1, 0, 4)]),
    t!(leftlong330, &["abcd", "b", "ce"], "abce", &[(1, 1, 2), (2, 2, 4),]),
    t!(leftlong340, &["a", "ab"], "xayabbbz", &[(0, 1, 2), (1, 3, 5)]),
];

/// Regression tests that are applied to all combinations.
///
/// If regression tests are needed for specific match semantics, then add them
/// to the appropriate group above.
const REGRESSION: &'static [SearchTest] = &[
    t!(regression010, &["inf", "ind"], "infind", &[(0, 0, 3), (1, 3, 6),]),
    t!(regression020, &["ind", "inf"], "infind", &[(1, 0, 3), (0, 3, 6),]),
    t!(
        regression030,
        &["libcore/", "libstd/"],
        "libcore/char/methods.rs",
        &[(0, 0, 8),]
    ),
    t!(
        regression040,
        &["libstd/", "libcore/"],
        "libcore/char/methods.rs",
        &[(1, 0, 8),]
    ),
    t!(
        regression050,
        &["\x00\x00\x01", "\x00\x00\x00"],
        "\x00\x00\x00",
        &[(1, 0, 3),]
    ),
    t!(
        regression060,
        &["\x00\x00\x00", "\x00\x00\x01"],
        "\x00\x00\x00",
        &[(0, 0, 3),]
    ),
];

const TEDDY: &'static [SearchTest] = &[
    t!(
        teddy010,
        &["a", "b", "c", "d", "e", "f", "g", "h", "i", "j", "k"],
        "abcdefghijk",
        &[
            (0, 0, 1),
            (1, 1, 2),
            (2, 2, 3),
            (3, 3, 4),
            (4, 4, 5),
            (5, 5, 6),
            (6, 6, 7),
            (7, 7, 8),
            (8, 8, 9),
            (9, 9, 10),
            (10, 10, 11)
        ]
    ),
    t!(
        teddy020,
        &["ab", "bc", "cd", "de", "ef", "fg", "gh", "hi", "ij", "jk", "kl"],
        "abcdefghijk",
        &[(0, 0, 2), (2, 2, 4), (4, 4, 6), (6, 6, 8), (8, 8, 10),]
    ),
    t!(
        teddy030,
        &["abc"],
        "abcdefghijklmnopqrstuvwxyzabcdefghijk",
        &[(0, 0, 3), (0, 26, 29)]
    ),
];

// Now define a test for each combination of things above that we want to run.
// Since there are a few different combinations for each collection of tests,
// we define a couple of macros to avoid repetition drudgery. The testconfig
// macro constructs the automaton from a given match kind, and runs the search
// tests one-by-one over the given collection. The `with` parameter allows one
// to configure the config with additional parameters. The testcombo macro
// invokes testconfig in precisely this way: it sets up several tests where
// each one turns a different knob on Config.

macro_rules! testconfig {
    ($name:ident, $collection:expr, $with:expr) => {
        #[test]
        fn $name() {
            run_search_tests($collection, |test| {
                let mut config = Config::new();
                $with(&mut config);
                let mut builder = config.builder();
                builder.extend(test.patterns.iter().map(|p| p.as_bytes()));
                let searcher = match builder.build() {
                    Some(searcher) => searcher,
                    None => {
                        // For x86-64 and aarch64, not building a searcher is
                        // probably a bug, so be loud.
                        if cfg!(any(
                            target_arch = "x86_64",
                            target_arch = "aarch64"
                        )) {
                            panic!("failed to build packed searcher")
                        }
                        return None;
                    }
                };
                Some(searcher.find_iter(&test.haystack).collect())
            });
        }
    };
}

testconfig!(
    search_default_leftmost_first,
    PACKED_LEFTMOST_FIRST,
    |_: &mut Config| {}
);

testconfig!(
    search_default_leftmost_longest,
    PACKED_LEFTMOST_LONGEST,
    |c: &mut Config| {
        c.match_kind(MatchKind::LeftmostLongest);
    }
);

testconfig!(
    search_teddy_leftmost_first,
    PACKED_LEFTMOST_FIRST,
    |c: &mut Config| {
        c.only_teddy(true);
    }
);

testconfig!(
    search_teddy_leftmost_longest,
    PACKED_LEFTMOST_LONGEST,
    |c: &mut Config| {
        c.only_teddy(true).match_kind(MatchKind::LeftmostLongest);
    }
);

testconfig!(
    search_teddy_ssse3_leftmost_first,
    PACKED_LEFTMOST_FIRST,
    |c: &mut Config| {
        c.only_teddy(true);
        #[cfg(target_arch = "x86_64")]
        if std::is_x86_feature_detected!("ssse3") {
            c.only_teddy_256bit(Some(false));
        }
    }
);

testconfig!(
    search_teddy_ssse3_leftmost_longest,
    PACKED_LEFTMOST_LONGEST,
    |c: &mut Config| {
        c.only_teddy(true).match_kind(MatchKind::LeftmostLongest);
        #[cfg(target_arch = "x86_64")]
        if std::is_x86_feature_detected!("ssse3") {
            c.only_teddy_256bit(Some(false));
        }
    }
);

testconfig!(
    search_teddy_avx2_leftmost_first,
    PACKED_LEFTMOST_FIRST,
    |c: &mut Config| {
        c.only_teddy(true);
        #[cfg(target_arch = "x86_64")]
        if std::is_x86_feature_detected!("avx2") {
            c.only_teddy_256bit(Some(true));
        }
    }
);

testconfig!(
    search_teddy_avx2_leftmost_longest,
    PACKED_LEFTMOST_LONGEST,
    |c: &mut Config| {
        c.only_teddy(true).match_kind(MatchKind::LeftmostLongest);
        #[cfg(target_arch = "x86_64")]
        if std::is_x86_feature_detected!("avx2") {
            c.only_teddy_256bit(Some(true));
        }
    }
);

testconfig!(
    search_teddy_fat_leftmost_first,
    PACKED_LEFTMOST_FIRST,
    |c: &mut Config| {
        c.only_teddy(true);
        #[cfg(target_arch = "x86_64")]
        if std::is_x86_feature_detected!("avx2") {
            c.only_teddy_fat(Some(true));
        }
    }
);

testconfig!(
    search_teddy_fat_leftmost_longest,
    PACKED_LEFTMOST_LONGEST,
    |c: &mut Config| {
        c.only_teddy(true).match_kind(MatchKind::LeftmostLongest);
        #[cfg(target_arch = "x86_64")]
        if std::is_x86_feature_detected!("avx2") {
            c.only_teddy_fat(Some(true));
        }
    }
);

testconfig!(
    search_rabinkarp_leftmost_first,
    PACKED_LEFTMOST_FIRST,
    |c: &mut Config| {
        c.only_rabin_karp(true);
    }
);

testconfig!(
    search_rabinkarp_leftmost_longest,
    PACKED_LEFTMOST_LONGEST,
    |c: &mut Config| {
        c.only_rabin_karp(true).match_kind(MatchKind::LeftmostLongest);
    }
);

#[test]
fn search_tests_have_unique_names() {
    let assert = |constname, tests: &[SearchTest]| {
        let mut seen = HashMap::new(); // map from test name to position
        for (i, test) in tests.iter().enumerate() {
            if !seen.contains_key(test.name) {
                seen.insert(test.name, i);
            } else {
                let last = seen[test.name];
                panic!(
                    "{} tests have duplicate names at positions {} and {}",
                    constname, last, i
                );
            }
        }
    };
    assert("BASICS", BASICS);
    assert("LEFTMOST", LEFTMOST);
    assert("LEFTMOST_FIRST", LEFTMOST_FIRST);
    assert("LEFTMOST_LONGEST", LEFTMOST_LONGEST);
    assert("REGRESSION", REGRESSION);
    assert("TEDDY", TEDDY);
}

fn run_search_tests<F: FnMut(&SearchTestOwned) -> Option<Vec<Match>>>(
    which: TestCollection,
    mut f: F,
) {
    let get_match_triples =
        |matches: Vec<Match>| -> Vec<(usize, usize, usize)> {
            matches
                .into_iter()
                .map(|m| (m.pattern().as_usize(), m.start(), m.end()))
                .collect()
        };
    for &tests in which {
        for spec in tests {
            for test in spec.variations() {
                let results = match f(&test) {
                    None => continue,
                    Some(results) => results,
                };
                assert_eq!(
                    test.matches,
                    get_match_triples(results).as_slice(),
                    "test: {}, patterns: {:?}, haystack(len={:?}): {:?}, \
                     offset: {:?}",
                    test.name,
                    test.patterns,
                    test.haystack.len(),
                    test.haystack,
                    test.offset,
                );
            }
        }
    }
}
1752 vendor/aho-corasick/src/packed/vector.rs vendored Normal file
File diff suppressed because it is too large
1664 vendor/aho-corasick/src/tests.rs vendored Normal file
File diff suppressed because it is too large
270 vendor/aho-corasick/src/transducer.rs vendored Normal file
@ -0,0 +1,270 @@
/*!
Provides implementations of `fst::Automaton` for Aho-Corasick automata.

This works by providing two wrapper types, [`Anchored`] and [`Unanchored`].
The former executes an anchored search on an FST while the latter executes
an unanchored search. Building these wrappers is fallible and will fail if
the underlying Aho-Corasick automaton does not support the type of search it
represents.
*/

use crate::{
    automaton::{Automaton, StateID},
    Anchored as AcAnchored, Input, MatchError,
};

/// Represents an unanchored Aho-Corasick search of a finite state transducer.
///
/// Wrapping an Aho-Corasick automaton in `Unanchored` will fail if the
/// underlying automaton does not support unanchored searches.
///
/// # Example
///
/// This shows how to build an FST of keys and then run an unanchored search on
/// those keys using an Aho-Corasick automaton.
///
/// ```
/// use aho_corasick::{nfa::contiguous::NFA, transducer::Unanchored};
/// use fst::{Automaton, IntoStreamer, Set, Streamer};
///
/// let set = Set::from_iter(&["abcd", "bc", "bcd", "xyz"]).unwrap();
/// let nfa = NFA::new(&["bcd", "x"]).unwrap();
/// // NFAs always support both unanchored and anchored searches.
/// let searcher = Unanchored::new(&nfa).unwrap();
///
/// let mut stream = set.search(searcher).into_stream();
/// let mut results = vec![];
/// while let Some(key) = stream.next() {
///     results.push(std::str::from_utf8(key).unwrap().to_string());
/// }
/// assert_eq!(vec!["abcd", "bcd", "xyz"], results);
/// ```
#[derive(Clone, Debug)]
pub struct Unanchored<A>(A);

impl<A: Automaton> Unanchored<A> {
    /// Create a new `Unanchored` implementation of the `fst::Automaton` trait.
    ///
    /// If the given Aho-Corasick automaton does not support unanchored
    /// searches, then this returns an error.
    pub fn new(aut: A) -> Result<Unanchored<A>, MatchError> {
        let input = Input::new("").anchored(AcAnchored::No);
        let _ = aut.start_state(&input)?;
        Ok(Unanchored(aut))
    }

    /// Returns a borrow to the underlying automaton.
    pub fn as_ref(&self) -> &A {
        &self.0
    }

    /// Unwrap this value and return the inner automaton.
    pub fn into_inner(self) -> A {
        self.0
    }
}

impl<A: Automaton> fst::Automaton for Unanchored<A> {
    type State = StateID;

    #[inline]
    fn start(&self) -> StateID {
        let input = Input::new("").anchored(AcAnchored::No);
        self.0.start_state(&input).expect("support for unanchored searches")
    }

    #[inline]
    fn is_match(&self, state: &StateID) -> bool {
        self.0.is_match(*state)
    }

    #[inline]
    fn accept(&self, state: &StateID, byte: u8) -> StateID {
        if fst::Automaton::is_match(self, state) {
            return *state;
        }
        self.0.next_state(AcAnchored::No, *state, byte)
    }

    #[inline]
    fn can_match(&self, state: &StateID) -> bool {
        !self.0.is_dead(*state)
    }
}
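
// Editor's note (illustrative comment, not part of the vendored file): the
// early return in `accept` above makes match states absorbing. Once any
// pattern has matched somewhere inside a key, the state stops advancing and
// `is_match` stays true for the remainder of that key, so a match anywhere in
// the key accepts it. For example, with the pattern "x" from the doctest:
//
//     key "xbax": start --x--> MATCH --b--> MATCH --a--> MATCH --x--> MATCH
//
// which is why keys like "xbax" (and "abcd" for "bcd") are reported even
// though the match does not end at the key's final byte.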

/// Represents an anchored Aho-Corasick search of a finite state transducer.
///
/// Wrapping an Aho-Corasick automaton in `Anchored` will fail if the
/// underlying automaton does not support anchored searches.
///
/// # Example
///
/// This shows how to build an FST of keys and then run an anchored search on
/// those keys using an Aho-Corasick automaton.
///
/// ```
/// use aho_corasick::{nfa::contiguous::NFA, transducer::Anchored};
/// use fst::{Automaton, IntoStreamer, Set, Streamer};
///
/// let set = Set::from_iter(&["abcd", "bc", "bcd", "xyz"]).unwrap();
/// let nfa = NFA::new(&["bcd", "x"]).unwrap();
/// // NFAs always support both unanchored and anchored searches.
/// let searcher = Anchored::new(&nfa).unwrap();
///
/// let mut stream = set.search(searcher).into_stream();
/// let mut results = vec![];
/// while let Some(key) = stream.next() {
///     results.push(std::str::from_utf8(key).unwrap().to_string());
/// }
/// assert_eq!(vec!["bcd", "xyz"], results);
/// ```
///
/// This is like the example above, except we use an Aho-Corasick DFA, which
/// requires explicitly configuring it to support anchored searches. (NFAs
/// unconditionally support both unanchored and anchored searches.)
///
/// ```
/// use aho_corasick::{dfa::DFA, transducer::Anchored, StartKind};
/// use fst::{Automaton, IntoStreamer, Set, Streamer};
///
/// let set = Set::from_iter(&["abcd", "bc", "bcd", "xyz"]).unwrap();
/// let dfa = DFA::builder()
///     .start_kind(StartKind::Anchored)
///     .build(&["bcd", "x"])
///     .unwrap();
/// // We've explicitly configured our DFA to support anchored searches.
/// let searcher = Anchored::new(&dfa).unwrap();
///
/// let mut stream = set.search(searcher).into_stream();
/// let mut results = vec![];
/// while let Some(key) = stream.next() {
///     results.push(std::str::from_utf8(key).unwrap().to_string());
/// }
/// assert_eq!(vec!["bcd", "xyz"], results);
/// ```
#[derive(Clone, Debug)]
pub struct Anchored<A>(A);

impl<A: Automaton> Anchored<A> {
    /// Create a new `Anchored` implementation of the `fst::Automaton` trait.
    ///
    /// If the given Aho-Corasick automaton does not support anchored searches,
    /// then this returns an error.
    pub fn new(aut: A) -> Result<Anchored<A>, MatchError> {
        let input = Input::new("").anchored(AcAnchored::Yes);
        let _ = aut.start_state(&input)?;
        Ok(Anchored(aut))
    }

    /// Returns a borrow to the underlying automaton.
    pub fn as_ref(&self) -> &A {
        &self.0
    }

    /// Unwrap this value and return the inner automaton.
    pub fn into_inner(self) -> A {
        self.0
    }
}

impl<A: Automaton> fst::Automaton for Anchored<A> {
    type State = StateID;

    #[inline]
    fn start(&self) -> StateID {
        let input = Input::new("").anchored(AcAnchored::Yes);
        self.0.start_state(&input).expect("support for anchored searches")
    }

    #[inline]
    fn is_match(&self, state: &StateID) -> bool {
        self.0.is_match(*state)
    }

    #[inline]
    fn accept(&self, state: &StateID, byte: u8) -> StateID {
        if fst::Automaton::is_match(self, state) {
            return *state;
        }
        self.0.next_state(AcAnchored::Yes, *state, byte)
    }

    #[inline]
    fn can_match(&self, state: &StateID) -> bool {
        !self.0.is_dead(*state)
    }
}

#[cfg(test)]
mod tests {
    use alloc::{string::String, vec, vec::Vec};

    use fst::{Automaton, IntoStreamer, Set, Streamer};

    use crate::{
        dfa::DFA,
        nfa::{contiguous, noncontiguous},
        StartKind,
    };

    use super::*;

    fn search<A: Automaton, D: AsRef<[u8]>>(
        set: &Set<D>,
        aut: A,
    ) -> Vec<String> {
        let mut stream = set.search(aut).into_stream();
        let mut results = vec![];
        while let Some(key) = stream.next() {
            results.push(String::from(core::str::from_utf8(key).unwrap()));
        }
        results
    }

    #[test]
    fn unanchored() {
        let set =
            Set::from_iter(&["a", "bar", "baz", "wat", "xba", "xbax", "z"])
                .unwrap();
        let patterns = vec!["baz", "bax"];
        let expected = vec!["baz", "xbax"];

        let aut = Unanchored(noncontiguous::NFA::new(&patterns).unwrap());
        let got = search(&set, &aut);
        assert_eq!(got, expected);

        let aut = Unanchored(contiguous::NFA::new(&patterns).unwrap());
        let got = search(&set, &aut);
        assert_eq!(got, expected);

        let aut = Unanchored(DFA::new(&patterns).unwrap());
        let got = search(&set, &aut);
        assert_eq!(got, expected);
    }

    #[test]
    fn anchored() {
        let set =
            Set::from_iter(&["a", "bar", "baz", "wat", "xba", "xbax", "z"])
                .unwrap();
        let patterns = vec!["baz", "bax"];
        let expected = vec!["baz"];

        let aut = Anchored(noncontiguous::NFA::new(&patterns).unwrap());
        let got = search(&set, &aut);
        assert_eq!(got, expected);

        let aut = Anchored(contiguous::NFA::new(&patterns).unwrap());
        let got = search(&set, &aut);
        assert_eq!(got, expected);

        let aut = Anchored(
            DFA::builder()
                .start_kind(StartKind::Anchored)
                .build(&patterns)
                .unwrap(),
        );
        let got = search(&set, &aut);
        assert_eq!(got, expected);
    }
}
409 vendor/aho-corasick/src/util/alphabet.rs vendored Normal file
@ -0,0 +1,409 @@
use crate::util::int::Usize;

/// A representation of byte oriented equivalence classes.
///
/// This is used in finite state machines to reduce the size of the transition
/// table. This can have a particularly large impact not only on the total size
/// of an FSM, but also on FSM build times because it reduces the number of
/// transitions that need to be visited/set.
#[derive(Clone, Copy)]
pub(crate) struct ByteClasses([u8; 256]);

impl ByteClasses {
    /// Creates a new set of equivalence classes where all bytes are mapped to
    /// the same class.
    pub(crate) fn empty() -> ByteClasses {
        ByteClasses([0; 256])
    }

    /// Creates a new set of equivalence classes where each byte belongs to
    /// its own equivalence class.
    pub(crate) fn singletons() -> ByteClasses {
        let mut classes = ByteClasses::empty();
        for b in 0..=255 {
            classes.set(b, b);
        }
        classes
    }

    /// Set the equivalence class for the given byte.
    #[inline]
    pub(crate) fn set(&mut self, byte: u8, class: u8) {
        self.0[usize::from(byte)] = class;
    }

    /// Get the equivalence class for the given byte.
    #[inline]
    pub(crate) fn get(&self, byte: u8) -> u8 {
        self.0[usize::from(byte)]
    }

    /// Return the total number of elements in the alphabet represented by
    /// these equivalence classes. Equivalently, this returns the total number
    /// of equivalence classes.
    #[inline]
    pub(crate) fn alphabet_len(&self) -> usize {
        // Add one since the number of equivalence classes is one bigger than
        // the last one.
        usize::from(self.0[255]) + 1
    }

    /// Returns the stride, as a base-2 exponent, required for these
    /// equivalence classes.
    ///
    /// The stride is always the smallest power of 2 that is greater than or
    /// equal to the alphabet length. This is done so that converting between
    /// state IDs and indices can be done with shifts alone, which is much
    /// faster than integer division. The "stride2" is the exponent, i.e.,
    /// `2^stride2 = stride`.
    pub(crate) fn stride2(&self) -> usize {
        let zeros = self.alphabet_len().next_power_of_two().trailing_zeros();
        usize::try_from(zeros).unwrap()
    }

    /// Returns the stride for these equivalence classes, which corresponds
    /// to the smallest power of 2 greater than or equal to the number of
    /// equivalence classes.
    pub(crate) fn stride(&self) -> usize {
        1 << self.stride2()
    }

    /// Returns true if and only if every byte in this class maps to its own
    /// equivalence class. Equivalently, there are 256 equivalence classes
    /// and each class contains exactly one byte.
    #[inline]
    pub(crate) fn is_singleton(&self) -> bool {
        self.alphabet_len() == 256
    }

    /// Returns an iterator over all equivalence classes in this set.
    pub(crate) fn iter(&self) -> ByteClassIter {
        ByteClassIter { it: 0..self.alphabet_len() }
    }

    /// Returns an iterator of the bytes in the given equivalence class.
    pub(crate) fn elements(&self, class: u8) -> ByteClassElements {
        ByteClassElements { classes: self, class, bytes: 0..=255 }
    }

    /// Returns an iterator of byte ranges in the given equivalence class.
    ///
    /// That is, a sequence of contiguous ranges are returned. Typically, every
    /// class maps to a single contiguous range.
    fn element_ranges(&self, class: u8) -> ByteClassElementRanges {
        ByteClassElementRanges { elements: self.elements(class), range: None }
    }
}
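
// Editor's note: a small worked example (illustrative, not part of the
// vendored file) of the stride computation documented above. With, say, 5
// equivalence classes, the next power of two is 8, so stride2 = 3 and a
// transition index can be computed with a shift instead of a multiply:
//
//     let alphabet_len = 5usize;
//     let stride = alphabet_len.next_power_of_two(); // 8
//     let stride2 = stride.trailing_zeros() as usize; // 3, since 2^3 = 8
//     let (state_index, class) = (7usize, 4usize);
//     let transition = (state_index << stride2) + class; // 7 * 8 + 4 = 60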

impl core::fmt::Debug for ByteClasses {
    fn fmt(&self, f: &mut core::fmt::Formatter<'_>) -> core::fmt::Result {
        if self.is_singleton() {
            write!(f, "ByteClasses(<one-class-per-byte>)")
        } else {
            write!(f, "ByteClasses(")?;
            for (i, class) in self.iter().enumerate() {
                if i > 0 {
                    write!(f, ", ")?;
                }
                write!(f, "{:?} => [", class)?;
                for (start, end) in self.element_ranges(class) {
                    if start == end {
                        write!(f, "{:?}", start)?;
                    } else {
                        write!(f, "{:?}-{:?}", start, end)?;
                    }
                }
                write!(f, "]")?;
            }
            write!(f, ")")
        }
    }
}

/// An iterator over each equivalence class.
#[derive(Debug)]
pub(crate) struct ByteClassIter {
    it: core::ops::Range<usize>,
}

impl Iterator for ByteClassIter {
    type Item = u8;

    fn next(&mut self) -> Option<u8> {
        self.it.next().map(|class| class.as_u8())
    }
}

/// An iterator over all elements in a specific equivalence class.
#[derive(Debug)]
pub(crate) struct ByteClassElements<'a> {
    classes: &'a ByteClasses,
    class: u8,
    bytes: core::ops::RangeInclusive<u8>,
}

impl<'a> Iterator for ByteClassElements<'a> {
    type Item = u8;

    fn next(&mut self) -> Option<u8> {
        while let Some(byte) = self.bytes.next() {
            if self.class == self.classes.get(byte) {
                return Some(byte);
            }
        }
        None
    }
}

/// An iterator over all elements in an equivalence class expressed as a
/// sequence of contiguous ranges.
#[derive(Debug)]
pub(crate) struct ByteClassElementRanges<'a> {
    elements: ByteClassElements<'a>,
    range: Option<(u8, u8)>,
}

impl<'a> Iterator for ByteClassElementRanges<'a> {
    type Item = (u8, u8);

    fn next(&mut self) -> Option<(u8, u8)> {
        loop {
            let element = match self.elements.next() {
                None => return self.range.take(),
                Some(element) => element,
            };
            match self.range.take() {
                None => {
                    self.range = Some((element, element));
                }
                Some((start, end)) => {
                    if usize::from(end) + 1 != usize::from(element) {
                        self.range = Some((element, element));
                        return Some((start, end));
                    }
                    self.range = Some((start, element));
                }
            }
        }
    }
}

/// A partitioning of bytes into equivalence classes.
///
/// A byte class set keeps track of an *approximation* of equivalence classes
/// of bytes during NFA construction. That is, every byte in an equivalence
/// class cannot discriminate between a match and a non-match.
///
/// Note that this may not compute the minimal set of equivalence classes.
/// Basically, any byte in a pattern given to the noncontiguous NFA builder
/// will automatically be treated as its own equivalence class. All other
/// bytes---any byte not in any pattern---will be treated as their own
/// equivalence classes. In theory, all bytes not in any pattern should
/// be part of a single equivalence class, but in practice, we only treat
/// contiguous ranges of bytes as an equivalence class. So the number of
/// classes computed may be bigger than necessary. This usually doesn't make
/// much of a difference, and keeps the implementation simple.
#[derive(Clone, Debug)]
pub(crate) struct ByteClassSet(ByteSet);

impl Default for ByteClassSet {
    fn default() -> ByteClassSet {
        ByteClassSet::empty()
    }
}

impl ByteClassSet {
    /// Create a new set of byte classes where all bytes are part of the same
    /// equivalence class.
    pub(crate) fn empty() -> Self {
        ByteClassSet(ByteSet::empty())
    }

    /// Indicate that the given range of bytes (inclusive) can discriminate a
    /// match between it and all other bytes outside of the range.
    pub(crate) fn set_range(&mut self, start: u8, end: u8) {
        debug_assert!(start <= end);
        if start > 0 {
            self.0.add(start - 1);
        }
        self.0.add(end);
    }

    /// Convert this boolean set to a map that maps all byte values to their
    /// corresponding equivalence class. The last mapping indicates the largest
    /// equivalence class identifier (which is never bigger than 255).
    pub(crate) fn byte_classes(&self) -> ByteClasses {
        let mut classes = ByteClasses::empty();
        let mut class = 0u8;
        let mut b = 0u8;
        loop {
            classes.set(b, class);
            if b == 255 {
                break;
            }
            if self.0.contains(b) {
                class = class.checked_add(1).unwrap();
            }
            b = b.checked_add(1).unwrap();
        }
        classes
    }
}

/// A simple set of bytes that is reasonably cheap to copy and allocation free.
#[derive(Clone, Copy, Debug, Default, Eq, PartialEq)]
pub(crate) struct ByteSet {
    bits: BitSet,
}

/// The representation of a byte set. Split out so that we can define a
/// convenient Debug impl for it while keeping "ByteSet" in the output.
#[derive(Clone, Copy, Default, Eq, PartialEq)]
struct BitSet([u128; 2]);

impl ByteSet {
    /// Create an empty set of bytes.
    pub(crate) fn empty() -> ByteSet {
        ByteSet { bits: BitSet([0; 2]) }
    }

    /// Add a byte to this set.
    ///
    /// If the given byte already belongs to this set, then this is a no-op.
    pub(crate) fn add(&mut self, byte: u8) {
        let bucket = byte / 128;
        let bit = byte % 128;
        self.bits.0[usize::from(bucket)] |= 1 << bit;
    }

    /// Return true if and only if the given byte is in this set.
    pub(crate) fn contains(&self, byte: u8) -> bool {
        let bucket = byte / 128;
        let bit = byte % 128;
        self.bits.0[usize::from(bucket)] & (1 << bit) > 0
    }
}

impl core::fmt::Debug for BitSet {
    fn fmt(&self, f: &mut core::fmt::Formatter) -> core::fmt::Result {
        let mut fmtd = f.debug_set();
        for b in 0u8..=255 {
            if (ByteSet { bits: *self }).contains(b) {
                fmtd.entry(&b);
            }
        }
        fmtd.finish()
    }
}

#[cfg(test)]
mod tests {
    use alloc::{vec, vec::Vec};

    use super::*;

    #[test]
    fn byte_classes() {
        let mut set = ByteClassSet::empty();
        set.set_range(b'a', b'z');

        let classes = set.byte_classes();
        assert_eq!(classes.get(0), 0);
        assert_eq!(classes.get(1), 0);
        assert_eq!(classes.get(2), 0);
        assert_eq!(classes.get(b'a' - 1), 0);
        assert_eq!(classes.get(b'a'), 1);
        assert_eq!(classes.get(b'm'), 1);
        assert_eq!(classes.get(b'z'), 1);
        assert_eq!(classes.get(b'z' + 1), 2);
        assert_eq!(classes.get(254), 2);
        assert_eq!(classes.get(255), 2);

        let mut set = ByteClassSet::empty();
        set.set_range(0, 2);
        set.set_range(4, 6);
        let classes = set.byte_classes();
        assert_eq!(classes.get(0), 0);
        assert_eq!(classes.get(1), 0);
        assert_eq!(classes.get(2), 0);
        assert_eq!(classes.get(3), 1);
        assert_eq!(classes.get(4), 2);
        assert_eq!(classes.get(5), 2);
        assert_eq!(classes.get(6), 2);
        assert_eq!(classes.get(7), 3);
        assert_eq!(classes.get(255), 3);
    }

    #[test]
    fn full_byte_classes() {
        let mut set = ByteClassSet::empty();
        for b in 0u8..=255 {
            set.set_range(b, b);
        }
        assert_eq!(set.byte_classes().alphabet_len(), 256);
    }

    #[test]
    fn elements_typical() {
        let mut set = ByteClassSet::empty();
        set.set_range(b'b', b'd');
        set.set_range(b'g', b'm');
        set.set_range(b'z', b'z');
        let classes = set.byte_classes();
        // class 0: \x00-a
        // class 1: b-d
        // class 2: e-f
        // class 3: g-m
        // class 4: n-y
        // class 5: z-z
        // class 6: \x7B-\xFF
        assert_eq!(classes.alphabet_len(), 7);

        let elements = classes.elements(0).collect::<Vec<_>>();
        assert_eq!(elements.len(), 98);
        assert_eq!(elements[0], b'\x00');
        assert_eq!(elements[97], b'a');

        let elements = classes.elements(1).collect::<Vec<_>>();
        assert_eq!(elements, vec![b'b', b'c', b'd'],);

        let elements = classes.elements(2).collect::<Vec<_>>();
        assert_eq!(elements, vec![b'e', b'f'],);

        let elements = classes.elements(3).collect::<Vec<_>>();
        assert_eq!(elements, vec![b'g', b'h', b'i', b'j', b'k', b'l', b'm',],);

        let elements = classes.elements(4).collect::<Vec<_>>();
        assert_eq!(elements.len(), 12);
        assert_eq!(elements[0], b'n');
        assert_eq!(elements[11], b'y');

        let elements = classes.elements(5).collect::<Vec<_>>();
        assert_eq!(elements, vec![b'z']);

        let elements = classes.elements(6).collect::<Vec<_>>();
        assert_eq!(elements.len(), 133);
        assert_eq!(elements[0], b'\x7B');
        assert_eq!(elements[132], b'\xFF');
    }

    #[test]
    fn elements_singletons() {
        let classes = ByteClasses::singletons();
        assert_eq!(classes.alphabet_len(), 256);

        let elements = classes.elements(b'a').collect::<Vec<_>>();
        assert_eq!(elements, vec![b'a']);
    }

    #[test]
    fn elements_empty() {
        let classes = ByteClasses::empty();
        assert_eq!(classes.alphabet_len(), 1);

        let elements = classes.elements(0).collect::<Vec<_>>();
        assert_eq!(elements.len(), 256);
        assert_eq!(elements[0], b'\x00');
        assert_eq!(elements[255], b'\xFF');
    }
}
124 vendor/aho-corasick/src/util/buffer.rs vendored Normal file
@ -0,0 +1,124 @@
use alloc::{vec, vec::Vec};

/// The default buffer capacity that we use for the stream buffer.
const DEFAULT_BUFFER_CAPACITY: usize = 64 * (1 << 10); // 64 KB

/// A fairly simple roll buffer for supporting stream searches.
///
/// This buffer acts as a temporary place to store a fixed amount of data when
/// reading from a stream. Its central purpose is to allow "rolling" some
/// suffix of the data to the beginning of the buffer before refilling it with
/// more data from the stream. For example, let's say we are trying to match
/// "foobar" on a stream. When we report the match, we'd like to not only
/// report the correct offsets at which the match occurs, but also the matching
/// bytes themselves. So let's say our stream is a file with the following
/// contents: `test test foobar test test`. Now assume that we happen to read
/// the aforementioned file in two chunks: `test test foo` and `bar test test`.
/// Naively, it would not be possible to report a single contiguous `foobar`
/// match, but this roll buffer allows us to do that. Namely, after the second
/// read, the contents of the buffer should be `st foobar test test`, where the
/// search should ultimately resume immediately after `foo`. (The prefix `st `
/// is included because the roll buffer saves N bytes at the end of the buffer,
/// where N is the maximum possible length of a match.)
///
/// A lot of the logic for dealing with this is unfortunately split out between
/// this roll buffer and the `StreamChunkIter`.
///
/// Note also that this buffer is not actually required just to report matches,
/// since a `Match` is only a pair of offsets. But it *is* required for
/// supporting things like `try_stream_replace_all` because that needs some
/// mechanism for knowing which bytes in the stream correspond to a match and
/// which don't. So when a match occurs across two `read` calls, *something*
/// needs to retain the bytes from the previous `read` call because you don't
/// know before the second read call whether a match exists or not.
#[derive(Debug)]
pub(crate) struct Buffer {
    /// The raw buffer contents. This has a fixed size and never increases.
    buf: Vec<u8>,
    /// The minimum size of the buffer, which is equivalent to the maximum
    /// possible length of a match. This corresponds to the amount that we
    /// roll.
    min: usize,
    /// The end of the contents of this buffer.
    end: usize,
}

impl Buffer {
    /// Create a new buffer for stream searching. The minimum buffer length
    /// given should be the size of the maximum possible match length.
    pub(crate) fn new(min_buffer_len: usize) -> Buffer {
        let min = core::cmp::max(1, min_buffer_len);
        // The minimum buffer amount is also the amount that we roll our
        // buffer in order to support incremental searching. To this end,
        // our actual capacity needs to be at least 1 byte bigger than our
        // minimum amount, otherwise we won't have any overlap. In actuality,
        // we want our buffer to be a bit bigger than that for performance
        // reasons, so we set a lower bound of `8 * min`.
        //
        // TODO: It would be good to find a way to test the streaming
        // implementation with the minimal buffer size. For now, we just
        // uncomment out the next line and comment out the subsequent line.
        // let capacity = 1 + min;
        let capacity = core::cmp::max(min * 8, DEFAULT_BUFFER_CAPACITY);
        Buffer { buf: vec![0; capacity], min, end: 0 }
    }
|
||||
|
||||
/// Return the contents of this buffer.
|
||||
#[inline]
|
||||
pub(crate) fn buffer(&self) -> &[u8] {
|
||||
&self.buf[..self.end]
|
||||
}
|
||||
|
||||
/// Return the minimum size of the buffer. The only way a buffer may be
|
||||
/// smaller than this is if the stream itself contains less than the
|
||||
/// minimum buffer amount.
|
||||
#[inline]
|
||||
pub(crate) fn min_buffer_len(&self) -> usize {
|
||||
self.min
|
||||
}
|
||||
|
||||
/// Return all free capacity in this buffer.
|
||||
fn free_buffer(&mut self) -> &mut [u8] {
|
||||
&mut self.buf[self.end..]
|
||||
}
|
||||
|
||||
/// Refill the contents of this buffer by reading as much as possible into
|
||||
/// this buffer's free capacity. If no more bytes could be read, then this
|
||||
/// returns false. Otherwise, this reads until it has filled the buffer
|
||||
/// past the minimum amount.
|
||||
pub(crate) fn fill<R: std::io::Read>(
|
||||
&mut self,
|
||||
mut rdr: R,
|
||||
) -> std::io::Result<bool> {
|
||||
let mut readany = false;
|
||||
loop {
|
||||
let readlen = rdr.read(self.free_buffer())?;
|
||||
if readlen == 0 {
|
||||
return Ok(readany);
|
||||
}
|
||||
readany = true;
|
||||
self.end += readlen;
|
||||
if self.buffer().len() >= self.min {
|
||||
return Ok(true);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/// Roll the contents of the buffer so that the suffix of this buffer is
|
||||
/// moved to the front and all other contents are dropped. The size of the
|
||||
/// suffix corresponds precisely to the minimum buffer length.
|
||||
///
|
||||
/// This should only be called when the entire contents of this buffer have
|
||||
/// been searched.
|
||||
pub(crate) fn roll(&mut self) {
|
||||
let roll_start = self
|
||||
.end
|
||||
.checked_sub(self.min)
|
||||
.expect("buffer capacity should be bigger than minimum amount");
|
||||
let roll_end = roll_start + self.min;
|
||||
|
||||
assert!(roll_end <= self.end);
|
||||
self.buf.copy_within(roll_start..roll_end, 0);
|
||||
self.end = self.min;
|
||||
}
|
||||
}
|
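
For concreteness, here is a compressed, hedged sketch of the fill/roll cycle
driven from the outside. The crate's real driver is `StreamChunkIter`; this
standalone version inlines a tiny buffer and deliberately uses a small
capacity (the real code uses `max(8 * min, 64 KB)`) so the roll is visible,
and it ignores the bookkeeping that avoids re-scanning the kept suffix:

use std::io::{Cursor, Read};

fn main() -> std::io::Result<()> {
    let mut stream = Cursor::new(b"test test foobar test test".to_vec());
    let (min, capacity) = (6usize, 13usize); // min = len("foobar")
    let (mut buf, mut end) = (vec![0u8; capacity], 0usize);

    loop {
        // fill: read until at least `min` bytes are buffered or EOF.
        let mut readany = false;
        while end < capacity {
            let n = stream.read(&mut buf[end..])?;
            if n == 0 {
                break;
            }
            readany = true;
            end += n;
            if end >= min {
                break;
            }
        }
        if !readany {
            break; // EOF; the kept suffix was already searched.
        }
        // search the buffered contents; the second pass sees "st foobar tes",
        // so the match split across the two reads is reported contiguously.
        if let Some(i) = buf[..end].windows(6).position(|w| w == b"foobar") {
            println!("found foobar at buffered offset {}", i);
        }
        // roll: keep the last `min` bytes, drop the rest.
        buf.copy_within(end - min..end, 0);
        end = min;
    }
    Ok(())
}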
258
vendor/aho-corasick/src/util/byte_frequencies.rs
vendored
Normal file
@ -0,0 +1,258 @@
|
||||
pub const BYTE_FREQUENCIES: [u8; 256] = [
|
||||
55, // '\x00'
|
||||
52, // '\x01'
|
||||
51, // '\x02'
|
||||
50, // '\x03'
|
||||
49, // '\x04'
|
||||
48, // '\x05'
|
||||
47, // '\x06'
|
||||
46, // '\x07'
|
||||
45, // '\x08'
|
||||
103, // '\t'
|
||||
242, // '\n'
|
||||
66, // '\x0b'
|
||||
67, // '\x0c'
|
||||
229, // '\r'
|
||||
44, // '\x0e'
|
||||
43, // '\x0f'
|
||||
42, // '\x10'
|
||||
41, // '\x11'
|
||||
40, // '\x12'
|
||||
39, // '\x13'
|
||||
38, // '\x14'
|
||||
37, // '\x15'
|
||||
36, // '\x16'
|
||||
35, // '\x17'
|
||||
34, // '\x18'
|
||||
33, // '\x19'
|
||||
56, // '\x1a'
|
||||
32, // '\x1b'
|
||||
31, // '\x1c'
|
||||
30, // '\x1d'
|
||||
29, // '\x1e'
|
||||
28, // '\x1f'
|
||||
255, // ' '
|
||||
148, // '!'
|
||||
164, // '"'
|
||||
149, // '#'
|
||||
136, // '$'
|
||||
160, // '%'
|
||||
155, // '&'
|
||||
173, // "'"
|
||||
221, // '('
|
||||
222, // ')'
|
||||
134, // '*'
|
||||
122, // '+'
|
||||
232, // ','
|
||||
202, // '-'
|
||||
215, // '.'
|
||||
224, // '/'
|
||||
208, // '0'
|
||||
220, // '1'
|
||||
204, // '2'
|
||||
187, // '3'
|
||||
183, // '4'
|
||||
179, // '5'
|
||||
177, // '6'
|
||||
168, // '7'
|
||||
178, // '8'
|
||||
200, // '9'
|
||||
226, // ':'
|
||||
195, // ';'
|
||||
154, // '<'
|
||||
184, // '='
|
||||
174, // '>'
|
||||
126, // '?'
|
||||
120, // '@'
|
||||
191, // 'A'
|
||||
157, // 'B'
|
||||
194, // 'C'
|
||||
170, // 'D'
|
||||
189, // 'E'
|
||||
162, // 'F'
|
||||
161, // 'G'
|
||||
150, // 'H'
|
||||
193, // 'I'
|
||||
142, // 'J'
|
||||
137, // 'K'
|
||||
171, // 'L'
|
||||
176, // 'M'
|
||||
185, // 'N'
|
||||
167, // 'O'
|
||||
186, // 'P'
|
||||
112, // 'Q'
|
||||
175, // 'R'
|
||||
192, // 'S'
|
||||
188, // 'T'
|
||||
156, // 'U'
|
||||
140, // 'V'
|
||||
143, // 'W'
|
||||
123, // 'X'
|
||||
133, // 'Y'
|
||||
128, // 'Z'
|
||||
147, // '['
|
||||
138, // '\\'
|
||||
146, // ']'
|
||||
114, // '^'
|
||||
223, // '_'
|
||||
151, // '`'
|
||||
249, // 'a'
|
||||
216, // 'b'
|
||||
238, // 'c'
|
||||
236, // 'd'
|
||||
253, // 'e'
|
||||
227, // 'f'
|
||||
218, // 'g'
|
||||
230, // 'h'
|
||||
247, // 'i'
|
||||
135, // 'j'
|
||||
180, // 'k'
|
||||
241, // 'l'
|
||||
233, // 'm'
|
||||
246, // 'n'
|
||||
244, // 'o'
|
||||
231, // 'p'
|
||||
139, // 'q'
|
||||
245, // 'r'
|
||||
243, // 's'
|
||||
251, // 't'
|
||||
235, // 'u'
|
||||
201, // 'v'
|
||||
196, // 'w'
|
||||
240, // 'x'
|
||||
214, // 'y'
|
||||
152, // 'z'
|
||||
182, // '{'
|
||||
205, // '|'
|
||||
181, // '}'
|
||||
127, // '~'
|
||||
27, // '\x7f'
|
||||
212, // '\x80'
|
||||
211, // '\x81'
|
||||
210, // '\x82'
|
||||
213, // '\x83'
|
||||
228, // '\x84'
|
||||
197, // '\x85'
|
||||
169, // '\x86'
|
||||
159, // '\x87'
|
||||
131, // '\x88'
|
||||
172, // '\x89'
|
||||
105, // '\x8a'
|
||||
80, // '\x8b'
|
||||
98, // '\x8c'
|
||||
96, // '\x8d'
|
||||
97, // '\x8e'
|
||||
81, // '\x8f'
|
||||
207, // '\x90'
|
||||
145, // '\x91'
|
||||
116, // '\x92'
|
||||
115, // '\x93'
|
||||
144, // '\x94'
|
||||
130, // '\x95'
|
||||
153, // '\x96'
|
||||
121, // '\x97'
|
||||
107, // '\x98'
|
||||
132, // '\x99'
|
||||
109, // '\x9a'
|
||||
110, // '\x9b'
|
||||
124, // '\x9c'
|
||||
111, // '\x9d'
|
||||
82, // '\x9e'
|
||||
108, // '\x9f'
|
||||
118, // '\xa0'
|
||||
141, // '¡'
|
||||
113, // '¢'
|
||||
129, // '£'
|
||||
119, // '¤'
|
||||
125, // '¥'
|
||||
165, // '¦'
|
||||
117, // '§'
|
||||
92, // '¨'
|
||||
106, // '©'
|
||||
83, // 'ª'
|
||||
72, // '«'
|
||||
99, // '¬'
|
||||
93, // '\xad'
|
||||
65, // '®'
|
||||
79, // '¯'
|
||||
166, // '°'
|
||||
237, // '±'
|
||||
163, // '²'
|
||||
199, // '³'
|
||||
190, // '´'
|
||||
225, // 'µ'
|
||||
209, // '¶'
|
||||
203, // '·'
|
||||
198, // '¸'
|
||||
217, // '¹'
|
||||
219, // 'º'
|
||||
206, // '»'
|
||||
234, // '¼'
|
||||
248, // '½'
|
||||
158, // '¾'
|
||||
239, // '¿'
|
||||
255, // 'À'
|
||||
255, // 'Á'
|
||||
255, // 'Â'
|
||||
255, // 'Ã'
|
||||
255, // 'Ä'
|
||||
255, // 'Å'
|
||||
255, // 'Æ'
|
||||
255, // 'Ç'
|
||||
255, // 'È'
|
||||
255, // 'É'
|
||||
255, // 'Ê'
|
||||
255, // 'Ë'
|
||||
255, // 'Ì'
|
||||
255, // 'Í'
|
||||
255, // 'Î'
|
||||
255, // 'Ï'
|
||||
255, // 'Ð'
|
||||
255, // 'Ñ'
|
||||
255, // 'Ò'
|
||||
255, // 'Ó'
|
||||
255, // 'Ô'
|
||||
255, // 'Õ'
|
||||
255, // 'Ö'
|
||||
255, // '×'
|
||||
255, // 'Ø'
|
||||
255, // 'Ù'
|
||||
255, // 'Ú'
|
||||
255, // 'Û'
|
||||
255, // 'Ü'
|
||||
255, // 'Ý'
|
||||
255, // 'Þ'
|
||||
255, // 'ß'
|
||||
255, // 'à'
|
||||
255, // 'á'
|
||||
255, // 'â'
|
||||
255, // 'ã'
|
||||
255, // 'ä'
|
||||
255, // 'å'
|
||||
255, // 'æ'
|
||||
255, // 'ç'
|
||||
255, // 'è'
|
||||
255, // 'é'
|
||||
255, // 'ê'
|
||||
255, // 'ë'
|
||||
255, // 'ì'
|
||||
255, // 'í'
|
||||
255, // 'î'
|
||||
255, // 'ï'
|
||||
255, // 'ð'
|
||||
255, // 'ñ'
|
||||
255, // 'ò'
|
||||
255, // 'ó'
|
||||
255, // 'ô'
|
||||
255, // 'õ'
|
||||
255, // 'ö'
|
||||
255, // '÷'
|
||||
255, // 'ø'
|
||||
255, // 'ù'
|
||||
255, // 'ú'
|
||||
255, // 'û'
|
||||
255, // 'ü'
|
||||
255, // 'ý'
|
||||
255, // 'þ'
|
||||
255, // 'ÿ'
|
||||
];
|
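
The weights above assign each byte a heuristic "commonness" rank: 255 for
bytes that dominate typical haystacks (space, vowels, high bytes common in
non-ASCII text), low values for rare control bytes. The prefilter builders
later in this diff compare these ranks to pick the rarest candidate byte per
pattern; the crate wraps the table in a helper presumably along these lines
(a sketch, not a verbatim copy of the vendored source):

use crate::util::byte_frequencies::BYTE_FREQUENCIES;

/// Lower rank means a rarer byte, hence a better prefilter target.
fn freq_rank(b: u8) -> u8 {
    BYTE_FREQUENCIES[b as usize]
}

/// Pick the heuristically rarest byte of a pattern, roughly what the
/// rare-byte prefilter builder does for each added pattern.
fn rarest_byte(pattern: &[u8]) -> Option<(usize, u8)> {
    pattern.iter().copied().enumerate().min_by_key(|&(_, b)| freq_rank(b))
}

For example, `rarest_byte(b"Sherlock")` picks `k` (rank 180), which lines up
with the `Sherlock`/`lockjaw` discussion in `RareBytesBuilder::add` below.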
26
vendor/aho-corasick/src/util/debug.rs
vendored
Normal file
@ -0,0 +1,26 @@
/// A type that wraps a single byte with a convenient fmt::Debug impl that
/// escapes the byte.
pub(crate) struct DebugByte(pub(crate) u8);

impl core::fmt::Debug for DebugByte {
    fn fmt(&self, f: &mut core::fmt::Formatter) -> core::fmt::Result {
        // Special case ASCII space. It's too hard to read otherwise, so
        // put quotes around it. I sometimes wonder whether just '\x20' would
        // be better...
        if self.0 == b' ' {
            return write!(f, "' '");
        }
        // 10 bytes is enough to cover any output from ascii::escape_default.
        let mut bytes = [0u8; 10];
        let mut len = 0;
        for (i, mut b) in core::ascii::escape_default(self.0).enumerate() {
            // capitalize \xab to \xAB
            if i >= 2 && b'a' <= b && b <= b'f' {
                b -= 32;
            }
            bytes[len] = b;
            len += 1;
        }
        write!(f, "{}", core::str::from_utf8(&bytes[..len]).unwrap())
    }
}
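
A quick illustration of what the impl above renders (hedged: `DebugByte` is
crate-private, so these assertions only run inside the crate, e.g. in a unit
test):

assert_eq!(format!("{:?}", DebugByte(b' ')), "' '");
assert_eq!(format!("{:?}", DebugByte(0xAB)), r"\xAB"); // hex is capitalized
assert_eq!(format!("{:?}", DebugByte(b'a')), "a");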
259
vendor/aho-corasick/src/util/error.rs
vendored
Normal file
@ -0,0 +1,259 @@
|
||||
use crate::util::{
|
||||
primitives::{PatternID, SmallIndex},
|
||||
search::MatchKind,
|
||||
};
|
||||
|
||||
/// An error that occurred during the construction of an Aho-Corasick
|
||||
/// automaton.
|
||||
///
|
||||
/// Build errors occur when some kind of limit has been exceeded, either in the
|
||||
/// number of states, the number of patterns or the length of a pattern. These
|
||||
/// limits aren't part of the public API, but they should generally be large
|
||||
/// enough to handle most use cases.
|
||||
///
|
||||
/// When the `std` feature is enabled, this implements the `std::error::Error`
|
||||
/// trait.
|
||||
#[derive(Clone, Debug)]
|
||||
pub struct BuildError {
|
||||
kind: ErrorKind,
|
||||
}
|
||||
|
||||
/// The kind of error that occurred.
|
||||
#[derive(Clone, Debug)]
|
||||
enum ErrorKind {
|
||||
/// An error that occurs when allocating a new state would result in an
|
||||
/// identifier that exceeds the capacity of a `StateID`.
|
||||
StateIDOverflow {
|
||||
/// The maximum possible id.
|
||||
max: u64,
|
||||
/// The maximum ID requested.
|
||||
requested_max: u64,
|
||||
},
|
||||
/// An error that occurs when adding a pattern to an Aho-Corasick
|
||||
/// automaton would result in an identifier that exceeds the capacity of a
|
||||
/// `PatternID`.
|
||||
PatternIDOverflow {
|
||||
/// The maximum possible id.
|
||||
max: u64,
|
||||
/// The maximum ID requested.
|
||||
requested_max: u64,
|
||||
},
|
||||
/// Occurs when a pattern string is given to the Aho-Corasick constructor
|
||||
/// that is too long.
|
||||
PatternTooLong {
|
||||
/// The ID of the pattern that was too long.
|
||||
pattern: PatternID,
|
||||
/// The length that was too long.
|
||||
len: usize,
|
||||
},
|
||||
}
|
||||
|
||||
impl BuildError {
|
||||
pub(crate) fn state_id_overflow(
|
||||
max: u64,
|
||||
requested_max: u64,
|
||||
) -> BuildError {
|
||||
BuildError { kind: ErrorKind::StateIDOverflow { max, requested_max } }
|
||||
}
|
||||
|
||||
pub(crate) fn pattern_id_overflow(
|
||||
max: u64,
|
||||
requested_max: u64,
|
||||
) -> BuildError {
|
||||
BuildError {
|
||||
kind: ErrorKind::PatternIDOverflow { max, requested_max },
|
||||
}
|
||||
}
|
||||
|
||||
pub(crate) fn pattern_too_long(
|
||||
pattern: PatternID,
|
||||
len: usize,
|
||||
) -> BuildError {
|
||||
BuildError { kind: ErrorKind::PatternTooLong { pattern, len } }
|
||||
}
|
||||
}
|
||||
|
||||
#[cfg(feature = "std")]
|
||||
impl std::error::Error for BuildError {}
|
||||
|
||||
impl core::fmt::Display for BuildError {
|
||||
fn fmt(&self, f: &mut core::fmt::Formatter<'_>) -> core::fmt::Result {
|
||||
match self.kind {
|
||||
ErrorKind::StateIDOverflow { max, requested_max } => {
|
||||
write!(
|
||||
f,
|
||||
"state identifier overflow: failed to create state ID \
|
||||
from {}, which exceeds the max of {}",
|
||||
requested_max, max,
|
||||
)
|
||||
}
|
||||
ErrorKind::PatternIDOverflow { max, requested_max } => {
|
||||
write!(
|
||||
f,
|
||||
"pattern identifier overflow: failed to create pattern ID \
|
||||
from {}, which exceeds the max of {}",
|
||||
requested_max, max,
|
||||
)
|
||||
}
|
||||
ErrorKind::PatternTooLong { pattern, len } => {
|
||||
write!(
|
||||
f,
|
||||
"pattern {} with length {} exceeds \
|
||||
the maximum pattern length of {}",
|
||||
pattern.as_usize(),
|
||||
len,
|
||||
SmallIndex::MAX.as_usize(),
|
||||
)
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/// An error that occurred during an Aho-Corasick search.
|
||||
///
|
||||
/// An error that occurs during a search is limited to some kind of
|
||||
/// misconfiguration that resulted in an illegal call. Stated differently,
|
||||
/// whether an error occurs is not dependent on the specific bytes in the
|
||||
/// haystack.
|
||||
///
|
||||
/// Examples of misconfiguration:
|
||||
///
|
||||
/// * Executing a stream or overlapping search on a searcher that was built with
|
||||
/// something other than [`MatchKind::Standard`](crate::MatchKind::Standard)
|
||||
/// semantics.
|
||||
/// * Requested an anchored or an unanchored search on a searcher that doesn't
|
||||
/// support unanchored or anchored searches, respectively.
|
||||
///
|
||||
/// When the `std` feature is enabled, this implements the `std::error::Error`
|
||||
/// trait.
|
||||
#[derive(Clone, Debug, Eq, PartialEq)]
|
||||
pub struct MatchError(alloc::boxed::Box<MatchErrorKind>);
|
||||
|
||||
impl MatchError {
|
||||
/// Create a new error value with the given kind.
|
||||
///
|
||||
/// This is a more verbose version of the kind-specific constructors, e.g.,
|
||||
/// `MatchError::unsupported_stream`.
|
||||
pub fn new(kind: MatchErrorKind) -> MatchError {
|
||||
MatchError(alloc::boxed::Box::new(kind))
|
||||
}
|
||||
|
||||
/// Returns a reference to the underlying error kind.
|
||||
pub fn kind(&self) -> &MatchErrorKind {
|
||||
&self.0
|
||||
}
|
||||
|
||||
/// Create a new "invalid anchored search" error. This occurs when the
|
||||
/// caller requests an anchored search but where anchored searches aren't
|
||||
/// supported.
|
||||
///
|
||||
/// This is the same as calling `MatchError::new` with a
|
||||
/// [`MatchErrorKind::InvalidInputAnchored`] kind.
|
||||
pub fn invalid_input_anchored() -> MatchError {
|
||||
MatchError::new(MatchErrorKind::InvalidInputAnchored)
|
||||
}
|
||||
|
||||
/// Create a new "invalid unanchored search" error. This occurs when the
|
||||
/// caller requests an unanchored search but where unanchored searches
|
||||
/// aren't supported.
|
||||
///
|
||||
/// This is the same as calling `MatchError::new` with a
|
||||
/// [`MatchErrorKind::InvalidInputUnanchored`] kind.
|
||||
pub fn invalid_input_unanchored() -> MatchError {
|
||||
MatchError::new(MatchErrorKind::InvalidInputUnanchored)
|
||||
}
|
||||
|
||||
/// Create a new "unsupported stream search" error. This occurs when the
|
||||
/// caller requests a stream search while using an Aho-Corasick automaton
|
||||
/// with a match kind other than [`MatchKind::Standard`].
|
||||
///
|
||||
/// The match kind given should be the match kind of the automaton. It
|
||||
/// should never be `MatchKind::Standard`.
|
||||
pub fn unsupported_stream(got: MatchKind) -> MatchError {
|
||||
MatchError::new(MatchErrorKind::UnsupportedStream { got })
|
||||
}
|
||||
|
||||
/// Create a new "unsupported overlapping search" error. This occurs when
|
||||
/// the caller requests an overlapping search while using an Aho-Corasick
|
||||
/// automaton with a match kind other than [`MatchKind::Standard`].
|
||||
///
|
||||
/// The match kind given should be the match kind of the automaton. It
|
||||
/// should never be `MatchKind::Standard`.
|
||||
pub fn unsupported_overlapping(got: MatchKind) -> MatchError {
|
||||
MatchError::new(MatchErrorKind::UnsupportedOverlapping { got })
|
||||
}
|
||||
|
||||
/// Create a new "unsupported empty pattern" error. This occurs when the
|
||||
/// caller requests a search for which matching an automaton that contains
|
||||
/// an empty pattern string is not supported.
|
||||
pub fn unsupported_empty() -> MatchError {
|
||||
MatchError::new(MatchErrorKind::UnsupportedEmpty)
|
||||
}
|
||||
}
|
||||
|
||||
/// The underlying kind of a [`MatchError`].
|
||||
///
|
||||
/// This is a **non-exhaustive** enum. That means new variants may be added in
|
||||
/// a semver-compatible release.
|
||||
#[non_exhaustive]
|
||||
#[derive(Clone, Debug, Eq, PartialEq)]
|
||||
pub enum MatchErrorKind {
|
||||
/// An error indicating that an anchored search was requested, but from a
|
||||
/// searcher that was built without anchored support.
|
||||
InvalidInputAnchored,
|
||||
/// An error indicating that an unanchored search was requested, but from a
|
||||
/// searcher that was built without unanchored support.
|
||||
InvalidInputUnanchored,
|
||||
/// An error indicating that a stream search was attempted on an
|
||||
/// Aho-Corasick automaton with an unsupported `MatchKind`.
|
||||
UnsupportedStream {
|
||||
/// The match semantics for the automaton that was used.
|
||||
got: MatchKind,
|
||||
},
|
||||
/// An error indicating that an overlapping search was attempted on an
|
||||
/// Aho-Corasick automaton with an unsupported `MatchKind`.
|
||||
UnsupportedOverlapping {
|
||||
/// The match semantics for the automaton that was used.
|
||||
got: MatchKind,
|
||||
},
|
||||
/// An error indicating that the operation requested doesn't support
|
||||
/// automatons that contain an empty pattern string.
|
||||
UnsupportedEmpty,
|
||||
}
|
||||
|
||||
#[cfg(feature = "std")]
|
||||
impl std::error::Error for MatchError {}
|
||||
|
||||
impl core::fmt::Display for MatchError {
|
||||
fn fmt(&self, f: &mut core::fmt::Formatter) -> core::fmt::Result {
|
||||
match *self.kind() {
|
||||
MatchErrorKind::InvalidInputAnchored => {
|
||||
write!(f, "anchored searches are not supported or enabled")
|
||||
}
|
||||
MatchErrorKind::InvalidInputUnanchored => {
|
||||
write!(f, "unanchored searches are not supported or enabled")
|
||||
}
|
||||
MatchErrorKind::UnsupportedStream { got } => {
|
||||
write!(
|
||||
f,
|
||||
"match kind {:?} does not support stream searching",
|
||||
got,
|
||||
)
|
||||
}
|
||||
MatchErrorKind::UnsupportedOverlapping { got } => {
|
||||
write!(
|
||||
f,
|
||||
"match kind {:?} does not support overlapping searches",
|
||||
got,
|
||||
)
|
||||
}
|
||||
MatchErrorKind::UnsupportedEmpty => {
|
||||
write!(
|
||||
f,
|
||||
"matching with an empty pattern string is not \
|
||||
supported for this operation",
|
||||
)
|
||||
}
|
||||
}
|
||||
}
|
||||
}
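
A hedged usage sketch of the search-error surface above: callers branch on
`MatchError::kind` to distinguish the misconfigurations. This assumes the
types are re-exported at the crate root, as in upstream aho-corasick:

use aho_corasick::{MatchError, MatchErrorKind};

fn explain(err: &MatchError) -> &'static str {
    match err.kind() {
        MatchErrorKind::InvalidInputAnchored => "anchored search not supported",
        MatchErrorKind::InvalidInputUnanchored => "unanchored search not supported",
        MatchErrorKind::UnsupportedStream { .. } => "stream search needs MatchKind::Standard",
        MatchErrorKind::UnsupportedOverlapping { .. } => "overlapping search needs MatchKind::Standard",
        MatchErrorKind::UnsupportedEmpty => "empty patterns unsupported for this operation",
        // The enum is #[non_exhaustive], so external code needs a catch-all.
        _ => "unknown match error",
    }
}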
|
284
vendor/aho-corasick/src/util/int.rs
vendored
Normal file
@ -0,0 +1,284 @@
|
||||
/*!
|
||||
This module provides several integer-oriented traits for converting between
|
||||
both fixed size integers and integers whose size varies based on the target
|
||||
(like `usize`).
|
||||
|
||||
The main design principle for this module is to centralize all uses of `as`.
|
||||
The thinking here is that `as` makes it very easy to perform accidental lossy
|
||||
conversions, and if we centralize all its uses here under more descriptive
|
||||
higher level operations, its use and correctness becomes easier to audit.
|
||||
|
||||
This was copied mostly wholesale from `regex-automata`.
|
||||
|
||||
NOTE: for simplicity, we don't take target pointer width into account here for
|
||||
`usize` conversions. Since we currently only panic in debug mode, skipping the
|
||||
check when it can be proven it isn't needed at compile time doesn't really
|
||||
matter. Now, if we wind up wanting to do as many checks as possible in release
|
||||
mode, then we would want to skip those when we know the conversions are always
|
||||
non-lossy.
|
||||
*/
|
||||
|
||||
pub(crate) trait U8 {
|
||||
fn as_usize(self) -> usize;
|
||||
}
|
||||
|
||||
impl U8 for u8 {
|
||||
fn as_usize(self) -> usize {
|
||||
usize::from(self)
|
||||
}
|
||||
}
|
||||
|
||||
pub(crate) trait U16 {
|
||||
fn as_usize(self) -> usize;
|
||||
fn low_u8(self) -> u8;
|
||||
fn high_u8(self) -> u8;
|
||||
}
|
||||
|
||||
impl U16 for u16 {
|
||||
fn as_usize(self) -> usize {
|
||||
usize::from(self)
|
||||
}
|
||||
|
||||
fn low_u8(self) -> u8 {
|
||||
self as u8
|
||||
}
|
||||
|
||||
fn high_u8(self) -> u8 {
|
||||
(self >> 8) as u8
|
||||
}
|
||||
}
|
||||
|
||||
pub(crate) trait U32 {
|
||||
fn as_usize(self) -> usize;
|
||||
fn low_u8(self) -> u8;
|
||||
fn low_u16(self) -> u16;
|
||||
fn high_u16(self) -> u16;
|
||||
}
|
||||
|
||||
impl U32 for u32 {
|
||||
#[inline]
|
||||
fn as_usize(self) -> usize {
|
||||
#[cfg(debug_assertions)]
|
||||
{
|
||||
usize::try_from(self).expect("u32 overflowed usize")
|
||||
}
|
||||
#[cfg(not(debug_assertions))]
|
||||
{
|
||||
self as usize
|
||||
}
|
||||
}
|
||||
|
||||
fn low_u8(self) -> u8 {
|
||||
self as u8
|
||||
}
|
||||
|
||||
fn low_u16(self) -> u16 {
|
||||
self as u16
|
||||
}
|
||||
|
||||
fn high_u16(self) -> u16 {
|
||||
(self >> 16) as u16
|
||||
}
|
||||
}
|
||||
|
||||
pub(crate) trait U64 {
|
||||
fn as_usize(self) -> usize;
|
||||
fn low_u8(self) -> u8;
|
||||
fn low_u16(self) -> u16;
|
||||
fn low_u32(self) -> u32;
|
||||
fn high_u32(self) -> u32;
|
||||
}
|
||||
|
||||
impl U64 for u64 {
|
||||
fn as_usize(self) -> usize {
|
||||
#[cfg(debug_assertions)]
|
||||
{
|
||||
usize::try_from(self).expect("u64 overflowed usize")
|
||||
}
|
||||
#[cfg(not(debug_assertions))]
|
||||
{
|
||||
self as usize
|
||||
}
|
||||
}
|
||||
|
||||
fn low_u8(self) -> u8 {
|
||||
self as u8
|
||||
}
|
||||
|
||||
fn low_u16(self) -> u16 {
|
||||
self as u16
|
||||
}
|
||||
|
||||
fn low_u32(self) -> u32 {
|
||||
self as u32
|
||||
}
|
||||
|
||||
fn high_u32(self) -> u32 {
|
||||
(self >> 32) as u32
|
||||
}
|
||||
}
|
||||
|
||||
pub(crate) trait I8 {
|
||||
fn as_usize(self) -> usize;
|
||||
fn to_bits(self) -> u8;
|
||||
fn from_bits(n: u8) -> i8;
|
||||
}
|
||||
|
||||
impl I8 for i8 {
|
||||
fn as_usize(self) -> usize {
|
||||
#[cfg(debug_assertions)]
|
||||
{
|
||||
usize::try_from(self).expect("i8 overflowed usize")
|
||||
}
|
||||
#[cfg(not(debug_assertions))]
|
||||
{
|
||||
self as usize
|
||||
}
|
||||
}
|
||||
|
||||
fn to_bits(self) -> u8 {
|
||||
self as u8
|
||||
}
|
||||
|
||||
fn from_bits(n: u8) -> i8 {
|
||||
n as i8
|
||||
}
|
||||
}
|
||||
|
||||
pub(crate) trait I32 {
|
||||
fn as_usize(self) -> usize;
|
||||
fn to_bits(self) -> u32;
|
||||
fn from_bits(n: u32) -> i32;
|
||||
}
|
||||
|
||||
impl I32 for i32 {
|
||||
fn as_usize(self) -> usize {
|
||||
#[cfg(debug_assertions)]
|
||||
{
|
||||
usize::try_from(self).expect("i32 overflowed usize")
|
||||
}
|
||||
#[cfg(not(debug_assertions))]
|
||||
{
|
||||
self as usize
|
||||
}
|
||||
}
|
||||
|
||||
fn to_bits(self) -> u32 {
|
||||
self as u32
|
||||
}
|
||||
|
||||
fn from_bits(n: u32) -> i32 {
|
||||
n as i32
|
||||
}
|
||||
}
|
||||
|
||||
pub(crate) trait I64 {
|
||||
fn as_usize(self) -> usize;
|
||||
fn to_bits(self) -> u64;
|
||||
fn from_bits(n: u64) -> i64;
|
||||
}
|
||||
|
||||
impl I64 for i64 {
|
||||
fn as_usize(self) -> usize {
|
||||
#[cfg(debug_assertions)]
|
||||
{
|
||||
usize::try_from(self).expect("i64 overflowed usize")
|
||||
}
|
||||
#[cfg(not(debug_assertions))]
|
||||
{
|
||||
self as usize
|
||||
}
|
||||
}
|
||||
|
||||
fn to_bits(self) -> u64 {
|
||||
self as u64
|
||||
}
|
||||
|
||||
fn from_bits(n: u64) -> i64 {
|
||||
n as i64
|
||||
}
|
||||
}
|
||||
|
||||
pub(crate) trait Usize {
|
||||
fn as_u8(self) -> u8;
|
||||
fn as_u16(self) -> u16;
|
||||
fn as_u32(self) -> u32;
|
||||
fn as_u64(self) -> u64;
|
||||
}
|
||||
|
||||
impl Usize for usize {
|
||||
fn as_u8(self) -> u8 {
|
||||
#[cfg(debug_assertions)]
|
||||
{
|
||||
u8::try_from(self).expect("usize overflowed u8")
|
||||
}
|
||||
#[cfg(not(debug_assertions))]
|
||||
{
|
||||
self as u8
|
||||
}
|
||||
}
|
||||
|
||||
fn as_u16(self) -> u16 {
|
||||
#[cfg(debug_assertions)]
|
||||
{
|
||||
u16::try_from(self).expect("usize overflowed u16")
|
||||
}
|
||||
#[cfg(not(debug_assertions))]
|
||||
{
|
||||
self as u16
|
||||
}
|
||||
}
|
||||
|
||||
fn as_u32(self) -> u32 {
|
||||
#[cfg(debug_assertions)]
|
||||
{
|
||||
u32::try_from(self).expect("usize overflowed u32")
|
||||
}
|
||||
#[cfg(not(debug_assertions))]
|
||||
{
|
||||
self as u32
|
||||
}
|
||||
}
|
||||
|
||||
fn as_u64(self) -> u64 {
|
||||
#[cfg(debug_assertions)]
|
||||
{
|
||||
u64::try_from(self).expect("usize overflowed u64")
|
||||
}
|
||||
#[cfg(not(debug_assertions))]
|
||||
{
|
||||
self as u64
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// Pointers aren't integers, but we convert pointers to integers to perform
|
||||
// offset arithmetic in some places. (And no, we don't convert the integers
|
||||
// back to pointers.) So add 'as_usize' conversions here too for completeness.
|
||||
//
|
||||
// These 'as' casts are actually okay because they're always non-lossy. But the
|
||||
// idea here is to just try and remove as much 'as' as possible, particularly
|
||||
// in this crate where we are being really paranoid about offsets and making
|
||||
// sure we don't panic on inputs that might be untrusted. This way, the 'as'
|
||||
// casts become easier to audit if they're all in one place, even when some of
|
||||
// them are actually okay 100% of the time.
|
||||
|
||||
pub(crate) trait Pointer {
|
||||
fn as_usize(self) -> usize;
|
||||
}
|
||||
|
||||
impl<T> Pointer for *const T {
|
||||
fn as_usize(self) -> usize {
|
||||
self as usize
|
||||
}
|
||||
}
|
||||
|
||||
pub(crate) trait PointerMut {
|
||||
fn as_usize(self) -> usize;
|
||||
}
|
||||
|
||||
impl<T> PointerMut for *mut T {
|
||||
fn as_usize(self) -> usize {
|
||||
self as usize
|
||||
}
|
||||
}
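
The conversions above all follow one pattern: a checked `try_from` in debug
builds, a plain `as` cast in release builds. A standalone sketch of that
trade-off (mirroring the code above, not the crate's `pub(crate)` traits):

fn as_u8(n: usize) -> u8 {
    #[cfg(debug_assertions)]
    {
        u8::try_from(n).expect("usize overflowed u8")
    }
    #[cfg(not(debug_assertions))]
    {
        n as u8
    }
}

fn main() {
    assert_eq!(as_u8(44), 44);
    // as_u8(300) panics under a debug build but silently truncates to 44
    // in release, which is exactly the trade-off the module doc describes.
}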
|
12
vendor/aho-corasick/src/util/mod.rs
vendored
Normal file
@ -0,0 +1,12 @@
pub(crate) mod alphabet;
#[cfg(feature = "std")]
pub(crate) mod buffer;
pub(crate) mod byte_frequencies;
pub(crate) mod debug;
pub(crate) mod error;
pub(crate) mod int;
pub(crate) mod prefilter;
pub(crate) mod primitives;
pub(crate) mod remapper;
pub(crate) mod search;
pub(crate) mod special;
924
vendor/aho-corasick/src/util/prefilter.rs
vendored
Normal file
@ -0,0 +1,924 @@
|
||||
use core::{
|
||||
cmp,
|
||||
fmt::Debug,
|
||||
panic::{RefUnwindSafe, UnwindSafe},
|
||||
u8,
|
||||
};
|
||||
|
||||
use alloc::{sync::Arc, vec, vec::Vec};
|
||||
|
||||
use crate::{
|
||||
packed,
|
||||
util::{
|
||||
alphabet::ByteSet,
|
||||
search::{Match, MatchKind, Span},
|
||||
},
|
||||
};
|
||||
|
||||
/// A prefilter for accelerating a search.
|
||||
///
|
||||
/// This crate uses prefilters in the core search implementations to accelerate
|
||||
/// common cases. They typically only apply to cases where there are a small
|
||||
/// number of patterns (less than 100 or so), but when they do, throughput can
|
||||
/// be boosted considerably, perhaps by an order of magnitude. When a prefilter
|
||||
/// is active, it is used whenever a search enters an automaton's start state.
|
||||
///
|
||||
/// Currently, prefilters cannot be constructed by
|
||||
/// callers. A `Prefilter` can only be accessed via the
|
||||
/// [`Automaton::prefilter`](crate::automaton::Automaton::prefilter)
|
||||
/// method and used to execute a search. In other words, a prefilter can be
|
||||
/// used to optimize your own search implementation if necessary, but cannot do
|
||||
/// much else. If you have a use case for more APIs, please submit an issue.
|
||||
#[derive(Clone, Debug)]
|
||||
pub struct Prefilter {
|
||||
finder: Arc<dyn PrefilterI>,
|
||||
memory_usage: usize,
|
||||
}
|
||||
|
||||
impl Prefilter {
|
||||
/// Execute a search in the haystack within the span given. If a match or
|
||||
/// a possible match is returned, then it is guaranteed to occur within
|
||||
/// the bounds of the span.
|
||||
///
|
||||
/// If the span provided is invalid for the given haystack, then behavior
|
||||
/// is unspecified.
|
||||
#[inline]
|
||||
pub fn find_in(&self, haystack: &[u8], span: Span) -> Candidate {
|
||||
self.finder.find_in(haystack, span)
|
||||
}
|
||||
|
||||
#[inline]
|
||||
pub(crate) fn memory_usage(&self) -> usize {
|
||||
self.memory_usage
|
||||
}
|
||||
}
|
||||
|
||||
/// A candidate is the result of running a prefilter on a haystack at a
|
||||
/// particular position.
|
||||
///
|
||||
/// The result is either no match, a confirmed match or a possible match.
|
||||
///
|
||||
/// When no match is returned, the prefilter is guaranteeing that no possible
|
||||
/// match can be found in the haystack, and the caller may trust this. That is,
|
||||
/// all correct prefilters must never report false negatives.
|
||||
///
|
||||
/// In some cases, a prefilter can confirm a match very quickly, in which case,
|
||||
/// the caller may use this to stop what it's doing and report the match. In
|
||||
/// this case, prefilter implementations must never report a false positive.
|
||||
/// In other cases, the prefilter can only report a potential match, in which
|
||||
/// case the callers must attempt to confirm the match. In this case, prefilter
|
||||
/// implementations are permitted to return false positives.
|
||||
#[derive(Clone, Debug)]
|
||||
pub enum Candidate {
|
||||
/// No match was found. Since false negatives are not possible, this means
|
||||
/// the search can quit as it is guaranteed not to find another match.
|
||||
None,
|
||||
/// A confirmed match was found. Callers do not need to confirm it.
|
||||
Match(Match),
|
||||
/// The start of a possible match was found. Callers must confirm it before
|
||||
/// reporting it as a match.
|
||||
PossibleStartOfMatch(usize),
|
||||
}
|
||||
|
||||
impl Candidate {
|
||||
/// Convert this candidate into an option. This is useful when callers
|
||||
/// do not distinguish between true positives and false positives (i.e.,
|
||||
/// the caller must always confirm the match).
|
||||
pub fn into_option(self) -> Option<usize> {
|
||||
match self {
|
||||
Candidate::None => None,
|
||||
Candidate::Match(ref m) => Some(m.start()),
|
||||
Candidate::PossibleStartOfMatch(start) => Some(start),
|
||||
}
|
||||
}
|
||||
}
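
Per the contract above, a caller consumes a `Candidate` like this (a sketch;
the `confirm` closure stands in for whatever verification the caller runs):

fn handle(
    candidate: Candidate,
    confirm: impl Fn(usize) -> Option<Match>,
) -> Option<Match> {
    match candidate {
        // False negatives are forbidden, so the search can stop here.
        Candidate::None => None,
        // Already confirmed by the prefilter; report as-is.
        Candidate::Match(m) => Some(m),
        // Possibly a false positive; the caller must verify.
        Candidate::PossibleStartOfMatch(start) => confirm(start),
    }
}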
|
||||
|
||||
/// A prefilter describes the behavior of fast literal scanners for quickly
|
||||
/// skipping past bytes in the haystack that we know cannot possibly
|
||||
/// participate in a match.
|
||||
trait PrefilterI:
|
||||
Send + Sync + RefUnwindSafe + UnwindSafe + Debug + 'static
|
||||
{
|
||||
/// Returns the next possible match candidate. This may yield false
|
||||
/// positives, so callers must confirm a match starting at the position
|
||||
/// returned. This, however, must never produce false negatives. That is,
|
||||
/// this must, at minimum, return the starting position of the next match
|
||||
/// in the given haystack after or at the given position.
|
||||
fn find_in(&self, haystack: &[u8], span: Span) -> Candidate;
|
||||
}
|
||||
|
||||
impl<P: PrefilterI + ?Sized> PrefilterI for Arc<P> {
|
||||
#[inline(always)]
|
||||
fn find_in(&self, haystack: &[u8], span: Span) -> Candidate {
|
||||
(**self).find_in(haystack, span)
|
||||
}
|
||||
}
|
||||
|
||||
/// A builder for constructing the best possible prefilter. When constructed,
|
||||
/// this builder will heuristically select the best prefilter it can build,
|
||||
/// if any, and discard the rest.
|
||||
#[derive(Debug)]
|
||||
pub(crate) struct Builder {
|
||||
count: usize,
|
||||
ascii_case_insensitive: bool,
|
||||
start_bytes: StartBytesBuilder,
|
||||
rare_bytes: RareBytesBuilder,
|
||||
memmem: MemmemBuilder,
|
||||
packed: Option<packed::Builder>,
|
||||
// If we run across a condition that suggests we shouldn't use a prefilter
|
||||
// at all (like an empty pattern), then disable prefilters entirely.
|
||||
enabled: bool,
|
||||
}
|
||||
|
||||
impl Builder {
|
||||
/// Create a new builder for constructing the best possible prefilter.
|
||||
pub(crate) fn new(kind: MatchKind) -> Builder {
|
||||
let pbuilder = kind
|
||||
.as_packed()
|
||||
.map(|kind| packed::Config::new().match_kind(kind).builder());
|
||||
Builder {
|
||||
count: 0,
|
||||
ascii_case_insensitive: false,
|
||||
start_bytes: StartBytesBuilder::new(),
|
||||
rare_bytes: RareBytesBuilder::new(),
|
||||
memmem: MemmemBuilder::default(),
|
||||
packed: pbuilder,
|
||||
enabled: true,
|
||||
}
|
||||
}
|
||||
|
||||
/// Enable ASCII case insensitivity. When set, byte strings added to this
|
||||
/// builder will be interpreted without respect to ASCII case.
|
||||
pub(crate) fn ascii_case_insensitive(mut self, yes: bool) -> Builder {
|
||||
self.ascii_case_insensitive = yes;
|
||||
self.start_bytes = self.start_bytes.ascii_case_insensitive(yes);
|
||||
self.rare_bytes = self.rare_bytes.ascii_case_insensitive(yes);
|
||||
self
|
||||
}
|
||||
|
||||
/// Return a prefilter suitable for quickly finding potential matches.
|
||||
///
|
||||
/// All patterns added to an Aho-Corasick automaton should be added to this
|
||||
/// builder before attempting to construct the prefilter.
|
||||
pub(crate) fn build(&self) -> Option<Prefilter> {
|
||||
if !self.enabled {
|
||||
debug!("prefilter not enabled, skipping");
|
||||
return None;
|
||||
}
|
||||
// If we only have one pattern, then deferring to memmem is always
|
||||
// the best choice. This is kind of a weird case, because, well, why
|
||||
// use Aho-Corasick if you only have one pattern? But maybe you don't
|
||||
// know exactly how many patterns you'll get up front, and you need to
|
||||
// support the option of multiple patterns. So instead of relying on
|
||||
// the caller to branch and use memmem explicitly, we just do it for
|
||||
// them.
|
||||
if !self.ascii_case_insensitive {
|
||||
if let Some(pre) = self.memmem.build() {
|
||||
debug!("using memmem prefilter");
|
||||
return Some(pre);
|
||||
}
|
||||
}
|
||||
let (packed, patlen, minlen) = if self.ascii_case_insensitive {
|
||||
(None, usize::MAX, 0)
|
||||
} else {
|
||||
let patlen = self.packed.as_ref().map_or(usize::MAX, |p| p.len());
|
||||
let minlen = self.packed.as_ref().map_or(0, |p| p.minimum_len());
|
||||
let packed =
|
||||
self.packed.as_ref().and_then(|b| b.build()).map(|s| {
|
||||
let memory_usage = s.memory_usage();
|
||||
debug!(
|
||||
"built packed prefilter (len: {}, \
|
||||
minimum pattern len: {}, memory usage: {}) \
|
||||
for consideration",
|
||||
patlen, minlen, memory_usage,
|
||||
);
|
||||
Prefilter { finder: Arc::new(Packed(s)), memory_usage }
|
||||
});
|
||||
(packed, patlen, minlen)
|
||||
};
|
||||
match (self.start_bytes.build(), self.rare_bytes.build()) {
|
||||
// If we could build both start and rare prefilters, then there are
|
||||
// a few cases in which we'd want to use the start-byte prefilter
|
||||
// over the rare-byte prefilter, since the former has lower
|
||||
// overhead.
|
||||
(prestart @ Some(_), prerare @ Some(_)) => {
|
||||
debug!(
|
||||
"both start (len={}, rank={}) and \
|
||||
rare (len={}, rank={}) byte prefilters \
|
||||
are available",
|
||||
self.start_bytes.count,
|
||||
self.start_bytes.rank_sum,
|
||||
self.rare_bytes.count,
|
||||
self.rare_bytes.rank_sum,
|
||||
);
|
||||
if patlen <= 16
|
||||
&& minlen >= 2
|
||||
&& self.start_bytes.count >= 3
|
||||
&& self.rare_bytes.count >= 3
|
||||
{
|
||||
debug!(
|
||||
"start and rare byte prefilters available, but \
|
||||
they're probably slower than packed so using \
|
||||
packed"
|
||||
);
|
||||
return packed;
|
||||
}
|
||||
// If the start-byte prefilter can scan for a smaller number
|
||||
// of bytes than the rare-byte prefilter, then it's probably
|
||||
// faster.
|
||||
let has_fewer_bytes =
|
||||
self.start_bytes.count < self.rare_bytes.count;
|
||||
// Otherwise, if the combined frequency rank of the detected
|
||||
// bytes in the start-byte prefilter is "close" to the combined
|
||||
// frequency rank of the rare-byte prefilter, then we pick
|
||||
// the start-byte prefilter even if the rare-byte prefilter
|
||||
// heuristically searches for rare bytes. This is because the
|
||||
// rare-byte prefilter has higher constant costs, so we tend to
|
||||
// prefer the start-byte prefilter when we can.
|
||||
let has_rarer_bytes =
|
||||
self.start_bytes.rank_sum <= self.rare_bytes.rank_sum + 50;
|
||||
if has_fewer_bytes {
|
||||
debug!(
|
||||
"using start byte prefilter because it has fewer
|
||||
bytes to search for than the rare byte prefilter",
|
||||
);
|
||||
prestart
|
||||
} else if has_rarer_bytes {
|
||||
debug!(
|
||||
"using start byte prefilter because its byte \
|
||||
frequency rank was determined to be \
|
||||
\"good enough\" relative to the rare byte prefilter \
|
||||
byte frequency rank",
|
||||
);
|
||||
prestart
|
||||
} else {
|
||||
debug!("using rare byte prefilter");
|
||||
prerare
|
||||
}
|
||||
}
|
||||
(prestart @ Some(_), None) => {
|
||||
if patlen <= 16 && minlen >= 2 && self.start_bytes.count >= 3 {
|
||||
debug!(
|
||||
"start byte prefilter available, but \
|
||||
it's probably slower than packed so using \
|
||||
packed"
|
||||
);
|
||||
return packed;
|
||||
}
|
||||
debug!(
|
||||
"have start byte prefilter but not rare byte prefilter, \
|
||||
so using start byte prefilter",
|
||||
);
|
||||
prestart
|
||||
}
|
||||
(None, prerare @ Some(_)) => {
|
||||
if patlen <= 16 && minlen >= 2 && self.rare_bytes.count >= 3 {
|
||||
debug!(
|
||||
"rare byte prefilter available, but \
|
||||
it's probably slower than packed so using \
|
||||
packed"
|
||||
);
|
||||
return packed;
|
||||
}
|
||||
debug!(
|
||||
"have rare byte prefilter but not start byte prefilter, \
|
||||
so using rare byte prefilter",
|
||||
);
|
||||
prerare
|
||||
}
|
||||
(None, None) if self.ascii_case_insensitive => {
|
||||
debug!(
|
||||
"no start or rare byte prefilter and ASCII case \
|
||||
insensitivity was enabled, so skipping prefilter",
|
||||
);
|
||||
None
|
||||
}
|
||||
(None, None) => {
|
||||
if packed.is_some() {
|
||||
debug!("falling back to packed prefilter");
|
||||
} else {
|
||||
debug!("no prefilter available");
|
||||
}
|
||||
packed
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/// Add a literal string to this prefilter builder.
|
||||
pub(crate) fn add(&mut self, bytes: &[u8]) {
|
||||
if bytes.is_empty() {
|
||||
self.enabled = false;
|
||||
}
|
||||
if !self.enabled {
|
||||
return;
|
||||
}
|
||||
self.count += 1;
|
||||
self.start_bytes.add(bytes);
|
||||
self.rare_bytes.add(bytes);
|
||||
self.memmem.add(bytes);
|
||||
if let Some(ref mut pbuilder) = self.packed {
|
||||
pbuilder.add(bytes);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/// A type that wraps a packed searcher and implements the `Prefilter`
|
||||
/// interface.
|
||||
#[derive(Clone, Debug)]
|
||||
struct Packed(packed::Searcher);
|
||||
|
||||
impl PrefilterI for Packed {
|
||||
fn find_in(&self, haystack: &[u8], span: Span) -> Candidate {
|
||||
self.0
|
||||
.find_in(&haystack, span)
|
||||
.map_or(Candidate::None, Candidate::Match)
|
||||
}
|
||||
}
|
||||
|
||||
/// A builder for constructing a prefilter that uses memmem.
|
||||
#[derive(Debug, Default)]
|
||||
struct MemmemBuilder {
|
||||
/// The number of patterns that have been added.
|
||||
count: usize,
|
||||
/// The singular pattern to search for. This is only set when count==1.
|
||||
one: Option<Vec<u8>>,
|
||||
}
|
||||
|
||||
impl MemmemBuilder {
|
||||
fn build(&self) -> Option<Prefilter> {
|
||||
#[cfg(all(feature = "std", feature = "perf-literal"))]
|
||||
fn imp(builder: &MemmemBuilder) -> Option<Prefilter> {
|
||||
let pattern = builder.one.as_ref()?;
|
||||
assert_eq!(1, builder.count);
|
||||
let finder = Arc::new(Memmem(
|
||||
memchr::memmem::Finder::new(pattern).into_owned(),
|
||||
));
|
||||
let memory_usage = pattern.len();
|
||||
Some(Prefilter { finder, memory_usage })
|
||||
}
|
||||
|
||||
#[cfg(not(all(feature = "std", feature = "perf-literal")))]
|
||||
fn imp(_: &MemmemBuilder) -> Option<Prefilter> {
|
||||
None
|
||||
}
|
||||
|
||||
imp(self)
|
||||
}
|
||||
|
||||
fn add(&mut self, bytes: &[u8]) {
|
||||
self.count += 1;
|
||||
if self.count == 1 {
|
||||
self.one = Some(bytes.to_vec());
|
||||
} else {
|
||||
self.one = None;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/// A type that wraps a SIMD accelerated single substring search from the
|
||||
/// `memchr` crate for use as a prefilter.
|
||||
///
|
||||
/// Currently, this prefilter is only active for Aho-Corasick searchers with
|
||||
/// a single pattern. In theory, this could be extended to support searchers
|
||||
/// that have a common prefix of more than one byte (for one byte, we would use
|
||||
/// memchr), but it's not clear if it's worth it or not.
|
||||
///
|
||||
/// Also, unfortunately, this currently also requires the 'std' feature to
|
||||
/// be enabled. That's because memchr doesn't have a no-std-but-with-alloc
|
||||
/// mode, and so APIs like Finder::into_owned aren't available when 'std' is
|
||||
/// disabled. But there should be an 'alloc' feature that brings in APIs like
|
||||
/// Finder::into_owned but doesn't use std-only features like runtime CPU
|
||||
/// feature detection.
|
||||
#[cfg(all(feature = "std", feature = "perf-literal"))]
|
||||
#[derive(Clone, Debug)]
|
||||
struct Memmem(memchr::memmem::Finder<'static>);
|
||||
|
||||
#[cfg(all(feature = "std", feature = "perf-literal"))]
|
||||
impl PrefilterI for Memmem {
|
||||
fn find_in(&self, haystack: &[u8], span: Span) -> Candidate {
|
||||
use crate::util::primitives::PatternID;
|
||||
|
||||
self.0.find(&haystack[span]).map_or(Candidate::None, |i| {
|
||||
let start = span.start + i;
|
||||
let end = start + self.0.needle().len();
|
||||
// N.B. We can declare a match and use a fixed pattern ID here
|
||||
// because a Memmem prefilter is only ever created for searchers
|
||||
// with exactly one pattern. Thus, every match is always a match
|
||||
// and it is always for the first and only pattern.
|
||||
Candidate::Match(Match::new(PatternID::ZERO, start..end))
|
||||
})
|
||||
}
|
||||
}
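
Outside this crate, the same single-pattern fast path is available directly
through the public `memchr` API that the `Memmem` wrapper above delegates to:

use memchr::memmem;

fn main() {
    let finder = memmem::Finder::new("foobar");
    let haystack = b"test test foobar test test";
    // Equivalent to what the Memmem prefilter reports: a confirmed match,
    // since a single-pattern search needs no further verification.
    assert_eq!(finder.find(haystack), Some(10));
}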
|
||||
|
||||
/// A builder for constructing a rare byte prefilter.
|
||||
///
|
||||
/// A rare byte prefilter attempts to pick out a small set of rare bytes that
|
||||
/// occur in the patterns, and then quickly scan for matches of those rare
|
||||
/// bytes.
|
||||
#[derive(Clone, Debug)]
|
||||
struct RareBytesBuilder {
|
||||
/// Whether this prefilter should account for ASCII case insensitivity or
|
||||
/// not.
|
||||
ascii_case_insensitive: bool,
|
||||
/// A set of rare bytes, indexed by byte value.
|
||||
rare_set: ByteSet,
|
||||
/// A set of byte offsets associated with bytes in a pattern. An entry
|
||||
/// corresponds to a particular byte (its index) and is only non-zero if
|
||||
/// the byte occurred at an offset greater than 0 in at least one pattern.
|
||||
///
|
||||
/// If a byte's offset is not representable in 8 bits, then the rare bytes
|
||||
/// prefilter becomes inert.
|
||||
byte_offsets: RareByteOffsets,
|
||||
/// Whether this is available as a prefilter or not. This can be set to
|
||||
/// false during construction if a condition is seen that invalidates the
|
||||
/// use of the rare-byte prefilter.
|
||||
available: bool,
|
||||
/// The number of bytes set to an active value in `byte_offsets`.
|
||||
count: usize,
|
||||
/// The sum of frequency ranks for the rare bytes detected. This is
|
||||
/// intended to give a heuristic notion of how rare the bytes are.
|
||||
rank_sum: u16,
|
||||
}
|
||||
|
||||
/// A set of byte offsets, keyed by byte.
|
||||
#[derive(Clone, Copy)]
|
||||
struct RareByteOffsets {
|
||||
/// Each entry corresponds to the maximum offset of the corresponding
|
||||
/// byte across all patterns seen.
|
||||
set: [RareByteOffset; 256],
|
||||
}
|
||||
|
||||
impl RareByteOffsets {
|
||||
/// Create a new empty set of rare byte offsets.
|
||||
pub(crate) fn empty() -> RareByteOffsets {
|
||||
RareByteOffsets { set: [RareByteOffset::default(); 256] }
|
||||
}
|
||||
|
||||
/// Add the given offset for the given byte to this set. If the given
/// offset is greater than the existing offset for that byte, then it
/// overwrites the previous value; otherwise, the existing (larger)
/// offset is kept.
|
||||
pub(crate) fn set(&mut self, byte: u8, off: RareByteOffset) {
|
||||
self.set[byte as usize].max =
|
||||
cmp::max(self.set[byte as usize].max, off.max);
|
||||
}
|
||||
}
|
||||
|
||||
impl core::fmt::Debug for RareByteOffsets {
|
||||
fn fmt(&self, f: &mut core::fmt::Formatter<'_>) -> core::fmt::Result {
|
||||
let mut offsets = vec![];
|
||||
for off in self.set.iter() {
|
||||
if off.max > 0 {
|
||||
offsets.push(off);
|
||||
}
|
||||
}
|
||||
f.debug_struct("RareByteOffsets").field("set", &offsets).finish()
|
||||
}
|
||||
}
|
||||
|
||||
/// Offsets associated with an occurrence of a "rare" byte in any of the
|
||||
/// patterns used to construct a single Aho-Corasick automaton.
|
||||
#[derive(Clone, Copy, Debug)]
|
||||
struct RareByteOffset {
|
||||
/// The maximum offset at which a particular byte occurs from the start
|
||||
/// of any pattern. This is used as a shift amount. That is, when an
|
||||
/// occurrence of this byte is found, the candidate position reported by
|
||||
/// the prefilter is `position_of_byte - max`, such that the automaton
|
||||
/// will begin its search at a position that is guaranteed to observe a
|
||||
/// match.
|
||||
///
|
||||
/// To avoid accidentally quadratic behavior, a prefilter is considered
|
||||
/// ineffective when it is asked to start scanning from a position that it
|
||||
/// has already scanned past.
|
||||
///
|
||||
/// Using a `u8` here means that if we ever see a pattern that's longer
|
||||
/// than 255 bytes, then the entire rare byte prefilter is disabled.
|
||||
max: u8,
|
||||
}
|
||||
|
||||
impl Default for RareByteOffset {
|
||||
fn default() -> RareByteOffset {
|
||||
RareByteOffset { max: 0 }
|
||||
}
|
||||
}
|
||||
|
||||
impl RareByteOffset {
|
||||
/// Create a new rare byte offset. If the given offset is too big, then
|
||||
/// None is returned. In that case, callers should render the rare bytes
|
||||
/// prefilter inert.
|
||||
fn new(max: usize) -> Option<RareByteOffset> {
|
||||
if max > u8::MAX as usize {
|
||||
None
|
||||
} else {
|
||||
Some(RareByteOffset { max: max as u8 })
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
impl RareBytesBuilder {
|
||||
/// Create a new builder for constructing a rare byte prefilter.
|
||||
fn new() -> RareBytesBuilder {
|
||||
RareBytesBuilder {
|
||||
ascii_case_insensitive: false,
|
||||
rare_set: ByteSet::empty(),
|
||||
byte_offsets: RareByteOffsets::empty(),
|
||||
available: true,
|
||||
count: 0,
|
||||
rank_sum: 0,
|
||||
}
|
||||
}
|
||||
|
||||
/// Enable ASCII case insensitivity. When set, byte strings added to this
|
||||
/// builder will be interpreted without respect to ASCII case.
|
||||
fn ascii_case_insensitive(mut self, yes: bool) -> RareBytesBuilder {
|
||||
self.ascii_case_insensitive = yes;
|
||||
self
|
||||
}
|
||||
|
||||
/// Build the rare bytes prefilter.
|
||||
///
|
||||
/// If there are more than 3 distinct rare bytes found, or if heuristics
|
||||
/// otherwise determine that this prefilter should not be used, then `None`
|
||||
/// is returned.
|
||||
fn build(&self) -> Option<Prefilter> {
|
||||
#[cfg(feature = "perf-literal")]
|
||||
fn imp(builder: &RareBytesBuilder) -> Option<Prefilter> {
|
||||
if !builder.available || builder.count > 3 {
|
||||
return None;
|
||||
}
|
||||
let (mut bytes, mut len) = ([0; 3], 0);
|
||||
for b in 0..=255 {
|
||||
if builder.rare_set.contains(b) {
|
||||
bytes[len] = b as u8;
|
||||
len += 1;
|
||||
}
|
||||
}
|
||||
let finder: Arc<dyn PrefilterI> = match len {
|
||||
0 => return None,
|
||||
1 => Arc::new(RareBytesOne {
|
||||
byte1: bytes[0],
|
||||
offset: builder.byte_offsets.set[bytes[0] as usize],
|
||||
}),
|
||||
2 => Arc::new(RareBytesTwo {
|
||||
offsets: builder.byte_offsets,
|
||||
byte1: bytes[0],
|
||||
byte2: bytes[1],
|
||||
}),
|
||||
3 => Arc::new(RareBytesThree {
|
||||
offsets: builder.byte_offsets,
|
||||
byte1: bytes[0],
|
||||
byte2: bytes[1],
|
||||
byte3: bytes[2],
|
||||
}),
|
||||
_ => unreachable!(),
|
||||
};
|
||||
Some(Prefilter { finder, memory_usage: 0 })
|
||||
}
|
||||
|
||||
#[cfg(not(feature = "perf-literal"))]
|
||||
fn imp(_: &RareBytesBuilder) -> Option<Prefilter> {
|
||||
None
|
||||
}
|
||||
|
||||
imp(self)
|
||||
}
|
||||
|
||||
/// Add a byte string to this builder.
|
||||
///
|
||||
/// All patterns added to an Aho-Corasick automaton should be added to this
|
||||
/// builder before attempting to construct the prefilter.
|
||||
fn add(&mut self, bytes: &[u8]) {
|
||||
// If we've already given up, then do nothing.
|
||||
if !self.available {
|
||||
return;
|
||||
}
|
||||
// If we've already blown our budget, then don't waste time looking
|
||||
// for more rare bytes.
|
||||
if self.count > 3 {
|
||||
self.available = false;
|
||||
return;
|
||||
}
|
||||
// If the pattern is too long, then our offset table is bunk, so
|
||||
// give up.
|
||||
if bytes.len() >= 256 {
|
||||
self.available = false;
|
||||
return;
|
||||
}
|
||||
let mut rarest = match bytes.get(0) {
|
||||
None => return,
|
||||
Some(&b) => (b, freq_rank(b)),
|
||||
};
|
||||
// The idea here is to look for the rarest byte in each pattern, and
|
||||
// add that to our set. As a special exception, if we see a byte that
|
||||
// we've already added, then we immediately stop and choose that byte,
|
||||
// even if there's another rare byte in the pattern. This helps us
|
||||
// apply the rare byte optimization in more cases by attempting to pick
|
||||
// bytes that are in common between patterns. So for example, if we
|
||||
// were searching for `Sherlock` and `lockjaw`, then this would pick
|
||||
// `k` for both patterns, resulting in the use of `memchr` instead of
|
||||
// `memchr2` for `k` and `j`.
|
||||
let mut found = false;
|
||||
for (pos, &b) in bytes.iter().enumerate() {
|
||||
self.set_offset(pos, b);
|
||||
if found {
|
||||
continue;
|
||||
}
|
||||
if self.rare_set.contains(b) {
|
||||
found = true;
|
||||
continue;
|
||||
}
|
||||
let rank = freq_rank(b);
|
||||
if rank < rarest.1 {
|
||||
rarest = (b, rank);
|
||||
}
|
||||
}
|
||||
if !found {
|
||||
self.add_rare_byte(rarest.0);
|
||||
}
|
||||
}
|
||||
|
||||
fn set_offset(&mut self, pos: usize, byte: u8) {
|
||||
// This unwrap is OK because pos is never bigger than our max.
|
||||
let offset = RareByteOffset::new(pos).unwrap();
|
||||
self.byte_offsets.set(byte, offset);
|
||||
if self.ascii_case_insensitive {
|
||||
self.byte_offsets.set(opposite_ascii_case(byte), offset);
|
||||
}
|
||||
}
|
||||
|
||||
fn add_rare_byte(&mut self, byte: u8) {
|
||||
self.add_one_rare_byte(byte);
|
||||
if self.ascii_case_insensitive {
|
||||
self.add_one_rare_byte(opposite_ascii_case(byte));
|
||||
}
|
||||
}
|
||||
|
||||
fn add_one_rare_byte(&mut self, byte: u8) {
|
||||
if !self.rare_set.contains(byte) {
|
||||
self.rare_set.add(byte);
|
||||
self.count += 1;
|
||||
self.rank_sum += freq_rank(byte) as u16;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/// A prefilter for scanning for a single "rare" byte.
|
||||
#[cfg(feature = "perf-literal")]
|
||||
#[derive(Clone, Debug)]
|
||||
struct RareBytesOne {
|
||||
byte1: u8,
|
||||
offset: RareByteOffset,
|
||||
}
|
||||
|
||||
#[cfg(feature = "perf-literal")]
|
||||
impl PrefilterI for RareBytesOne {
|
||||
fn find_in(&self, haystack: &[u8], span: Span) -> Candidate {
|
||||
memchr::memchr(self.byte1, &haystack[span])
|
||||
.map(|i| {
|
||||
let pos = span.start + i;
|
||||
cmp::max(
|
||||
span.start,
|
||||
pos.saturating_sub(usize::from(self.offset.max)),
|
||||
)
|
||||
})
|
||||
.map_or(Candidate::None, Candidate::PossibleStartOfMatch)
|
||||
}
|
||||
}
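
A worked example of the shift described above: if `r` is the chosen rare byte
and its maximum offset across all patterns is 5 (its position in `foobar`),
then finding `r` at haystack position 15 yields the candidate start
`15 - 5 = 10`, exactly where `foobar` begins. A standalone sketch using the
public `memchr` API:

use memchr::memchr;

fn main() {
    let haystack = b"test test foobar test test";
    let (rare, max_off) = (b'r', 5); // 'r' sits at offset 5 in "foobar"
    let i = memchr(rare, haystack).unwrap();
    // Shift back by the byte's maximum pattern offset to get a candidate
    // start; the automaton verifies from there (false positives are fine).
    let candidate = i.saturating_sub(max_off);
    assert_eq!(candidate, 10);
    assert!(haystack[candidate..].starts_with(b"foobar"));
}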
|
||||
|
||||
/// A prefilter for scanning for two "rare" bytes.
|
||||
#[cfg(feature = "perf-literal")]
|
||||
#[derive(Clone, Debug)]
|
||||
struct RareBytesTwo {
|
||||
offsets: RareByteOffsets,
|
||||
byte1: u8,
|
||||
byte2: u8,
|
||||
}
|
||||
|
||||
#[cfg(feature = "perf-literal")]
|
||||
impl PrefilterI for RareBytesTwo {
|
||||
fn find_in(&self, haystack: &[u8], span: Span) -> Candidate {
|
||||
memchr::memchr2(self.byte1, self.byte2, &haystack[span])
|
||||
.map(|i| {
|
||||
let pos = span.start + i;
|
||||
let offset = self.offsets.set[usize::from(haystack[pos])].max;
|
||||
cmp::max(span.start, pos.saturating_sub(usize::from(offset)))
|
||||
})
|
||||
.map_or(Candidate::None, Candidate::PossibleStartOfMatch)
|
||||
}
|
||||
}
|
||||
|
||||
/// A prefilter for scanning for three "rare" bytes.
|
||||
#[cfg(feature = "perf-literal")]
|
||||
#[derive(Clone, Debug)]
|
||||
struct RareBytesThree {
|
||||
offsets: RareByteOffsets,
|
||||
byte1: u8,
|
||||
byte2: u8,
|
||||
byte3: u8,
|
||||
}
|
||||
|
||||
#[cfg(feature = "perf-literal")]
|
||||
impl PrefilterI for RareBytesThree {
|
||||
fn find_in(&self, haystack: &[u8], span: Span) -> Candidate {
|
||||
memchr::memchr3(self.byte1, self.byte2, self.byte3, &haystack[span])
|
||||
.map(|i| {
|
||||
let pos = span.start + i;
|
||||
let offset = self.offsets.set[usize::from(haystack[pos])].max;
|
||||
cmp::max(span.start, pos.saturating_sub(usize::from(offset)))
|
||||
})
|
||||
.map_or(Candidate::None, Candidate::PossibleStartOfMatch)
|
||||
}
|
||||
}
|
||||
|
||||
/// A builder for constructing a starting byte prefilter.
|
||||
///
|
||||
/// A starting byte prefilter is a simplistic prefilter that looks for possible
|
||||
/// matches by reporting all positions corresponding to a particular byte. This
|
||||
/// generally only takes effect when there are at most 3 distinct possible
|
||||
/// starting bytes. e.g., the patterns `foo`, `bar`, and `baz` have two
|
||||
/// distinct starting bytes (`f` and `b`), and this prefilter returns all
|
||||
/// occurrences of either `f` or `b`.
|
||||
///
|
||||
/// In some cases, a heuristic frequency analysis may determine that it would
|
||||
/// be better not to use this prefilter even when there are 3 or fewer distinct
|
||||
/// starting bytes.
|
||||
#[derive(Clone, Debug)]
|
||||
struct StartBytesBuilder {
|
||||
/// Whether this prefilter should account for ASCII case insensitivity or
|
||||
/// not.
|
||||
ascii_case_insensitive: bool,
|
||||
/// The set of starting bytes observed.
|
||||
byteset: Vec<bool>,
|
||||
/// The number of bytes set to true in `byteset`.
|
||||
count: usize,
|
||||
/// The sum of frequency ranks for the rare bytes detected. This is
|
||||
/// intended to give a heuristic notion of how rare the bytes are.
|
||||
rank_sum: u16,
|
||||
}
|
||||
|
||||
impl StartBytesBuilder {
|
||||
/// Create a new builder for constructing a start byte prefilter.
|
||||
fn new() -> StartBytesBuilder {
|
||||
StartBytesBuilder {
|
||||
ascii_case_insensitive: false,
|
||||
byteset: vec![false; 256],
|
||||
count: 0,
|
||||
rank_sum: 0,
|
||||
}
|
||||
}
|
||||
|
||||
/// Enable ASCII case insensitivity. When set, byte strings added to this
|
||||
/// builder will be interpreted without respect to ASCII case.
|
||||
fn ascii_case_insensitive(mut self, yes: bool) -> StartBytesBuilder {
|
||||
self.ascii_case_insensitive = yes;
|
||||
self
|
||||
}
|
||||
|
||||
/// Build the starting bytes prefilter.
|
||||
///
|
||||
/// If there are more than 3 distinct starting bytes, or if heuristics
|
||||
/// otherwise determine that this prefilter should not be used, then `None`
|
||||
/// is returned.
|
||||
fn build(&self) -> Option<Prefilter> {
|
||||
#[cfg(feature = "perf-literal")]
|
||||
fn imp(builder: &StartBytesBuilder) -> Option<Prefilter> {
|
||||
if builder.count > 3 {
|
||||
return None;
|
||||
}
|
||||
let (mut bytes, mut len) = ([0; 3], 0);
|
||||
for b in 0..256 {
|
||||
if !builder.byteset[b] {
|
||||
continue;
|
||||
}
|
||||
// We don't handle non-ASCII bytes for now. Getting non-ASCII
|
||||
// bytes right is trickier, since we generally don't want to put
|
||||
// a leading UTF-8 code unit into a prefilter that isn't ASCII,
|
||||
// since they can frequently. Instead, it would be better to use a
|
||||
// continuation byte, but this requires more sophisticated analysis
|
||||
// of the automaton and a richer prefilter API.
|
||||
if b > 0x7F {
|
||||
return None;
|
||||
}
|
||||
bytes[len] = b as u8;
|
||||
len += 1;
|
||||
}
|
||||
let finder: Arc<dyn PrefilterI> = match len {
|
||||
0 => return None,
|
||||
1 => Arc::new(StartBytesOne { byte1: bytes[0] }),
|
||||
2 => Arc::new(StartBytesTwo {
|
||||
byte1: bytes[0],
|
||||
byte2: bytes[1],
|
||||
}),
|
||||
3 => Arc::new(StartBytesThree {
|
||||
byte1: bytes[0],
|
||||
byte2: bytes[1],
|
||||
byte3: bytes[2],
|
||||
}),
|
||||
_ => unreachable!(),
|
||||
};
|
||||
Some(Prefilter { finder, memory_usage: 0 })
|
||||
}
|
||||
|
||||
#[cfg(not(feature = "perf-literal"))]
|
||||
fn imp(_: &StartBytesBuilder) -> Option<Prefilter> {
|
||||
None
|
||||
}
|
||||
|
||||
imp(self)
|
||||
}
|
||||
|
||||
/// Add a byte string to this builder.
|
||||
///
|
||||
/// All patterns added to an Aho-Corasick automaton should be added to this
|
||||
/// builder before attempting to construct the prefilter.
|
||||
fn add(&mut self, bytes: &[u8]) {
|
||||
if self.count > 3 {
|
||||
return;
|
||||
}
|
||||
if let Some(&byte) = bytes.get(0) {
|
||||
self.add_one_byte(byte);
|
||||
if self.ascii_case_insensitive {
|
||||
self.add_one_byte(opposite_ascii_case(byte));
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
fn add_one_byte(&mut self, byte: u8) {
|
||||
if !self.byteset[byte as usize] {
|
||||
self.byteset[byte as usize] = true;
|
||||
self.count += 1;
|
||||
self.rank_sum += freq_rank(byte) as u16;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
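// Editor's note: the following test module is not part of the vendored
// crate. It is a minimal sketch of the starting-byte prefilter idea
// described above: collect the distinct first bytes of all patterns and
// report every occurrence of any of them as a candidate position. The
// pattern set and haystack are hypothetical.
#[cfg(test)]
mod start_bytes_sketch {
    #[test]
    fn candidate_positions() {
        let patterns: &[&[u8]] = &[b"foo", b"bar", b"baz"];
        // Two distinct starting bytes: `b` and `f`.
        let mut starts: alloc::vec::Vec<u8> =
            patterns.iter().map(|p| p[0]).collect();
        starts.sort_unstable();
        starts.dedup();
        assert_eq!(starts, [b'b', b'f']);
        // A candidate is any position holding one of the start bytes; the
        // automaton still has to verify a real match starting there.
        let haystack = b"quux bar";
        let candidate = haystack.iter().position(|b| starts.contains(b));
        assert_eq!(candidate, Some(5));
    }
}
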
/// A prefilter for scanning for a single starting byte.
#[cfg(feature = "perf-literal")]
#[derive(Clone, Debug)]
struct StartBytesOne {
    byte1: u8,
}

#[cfg(feature = "perf-literal")]
impl PrefilterI for StartBytesOne {
    fn find_in(&self, haystack: &[u8], span: Span) -> Candidate {
        memchr::memchr(self.byte1, &haystack[span])
            .map(|i| span.start + i)
            .map_or(Candidate::None, Candidate::PossibleStartOfMatch)
    }
}

/// A prefilter for scanning for two starting bytes.
#[cfg(feature = "perf-literal")]
#[derive(Clone, Debug)]
struct StartBytesTwo {
    byte1: u8,
    byte2: u8,
}

#[cfg(feature = "perf-literal")]
impl PrefilterI for StartBytesTwo {
    fn find_in(&self, haystack: &[u8], span: Span) -> Candidate {
        memchr::memchr2(self.byte1, self.byte2, &haystack[span])
            .map(|i| span.start + i)
            .map_or(Candidate::None, Candidate::PossibleStartOfMatch)
    }
}

/// A prefilter for scanning for three starting bytes.
#[cfg(feature = "perf-literal")]
#[derive(Clone, Debug)]
struct StartBytesThree {
    byte1: u8,
    byte2: u8,
    byte3: u8,
}

#[cfg(feature = "perf-literal")]
impl PrefilterI for StartBytesThree {
    fn find_in(&self, haystack: &[u8], span: Span) -> Candidate {
        memchr::memchr3(self.byte1, self.byte2, self.byte3, &haystack[span])
            .map(|i| span.start + i)
            .map_or(Candidate::None, Candidate::PossibleStartOfMatch)
    }
}

/// If the given byte is an ASCII letter, then return it in the opposite case.
/// e.g., Given `b'A'`, this returns `b'a'`, and given `b'a'`, this returns
/// `b'A'`. If the given byte is not an ASCII letter, then it is returned
/// unchanged.
pub(crate) fn opposite_ascii_case(b: u8) -> u8 {
    if b'A' <= b && b <= b'Z' {
        b.to_ascii_lowercase()
    } else if b'a' <= b && b <= b'z' {
        b.to_ascii_uppercase()
    } else {
        b
    }
}

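// Editor's note: a small test sketch, not part of the vendored source,
// exercising `opposite_ascii_case` above: ASCII letters are flipped and
// every other byte is returned unchanged.
#[cfg(test)]
mod opposite_ascii_case_sketch {
    use super::opposite_ascii_case;

    #[test]
    fn flips_ascii_letters_only() {
        assert_eq!(opposite_ascii_case(b'A'), b'a');
        assert_eq!(opposite_ascii_case(b'z'), b'Z');
        // Digits, punctuation and non-ASCII bytes pass through untouched.
        assert_eq!(opposite_ascii_case(b'7'), b'7');
        assert_eq!(opposite_ascii_case(0xC3), 0xC3);
    }
}
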
/// Return the frequency rank of the given byte. The higher the rank, the more
/// common the byte (heuristically speaking).
fn freq_rank(b: u8) -> u8 {
    use crate::util::byte_frequencies::BYTE_FREQUENCIES;
    BYTE_FREQUENCIES[b as usize]
}
759
vendor/aho-corasick/src/util/primitives.rs
vendored
Normal file
@ -0,0 +1,759 @@
/*!
Lower level primitive types that are useful in a variety of circumstances.

# Overview

This list represents the principal types in this module and briefly describes
when you might want to use them.

* [`PatternID`] - A type that represents the identifier of a regex pattern.
This is probably the most widely used type in this module (which is why it's
also re-exported in the crate root).
* [`StateID`] - A type that represents the identifier of a finite automaton
state. This is used for both NFAs and DFAs, with the notable exception of
the hybrid NFA/DFA. (The hybrid NFA/DFA uses a special purpose "lazy" state
identifier.)
* [`SmallIndex`] - The internal representation of both a `PatternID` and a
`StateID`. Its purpose is to serve as a type that can index memory without
being as big as a `usize` on 64-bit targets. The main idea behind this type
is that there are many things in regex engines that will, in practice, never
overflow a 32-bit integer. (For example, the number of patterns in a regex
or the number of states in an NFA.) Thus, a `SmallIndex` can be used to index
memory without peppering `as` casts everywhere. Moreover, it forces callers
to handle errors in the case where, somehow, the value would otherwise overflow
either a 32-bit integer or a `usize` (e.g., on 16-bit targets).
*/

// The macro we use to define some types below adds methods that we don't
// use on some of the types. There isn't much, so we just squash the warning.
#![allow(dead_code)]

use alloc::vec::Vec;

use crate::util::int::{Usize, U16, U32, U64};

/// A type that represents a "small" index.
///
/// The main idea of this type is to provide something that can index memory,
/// but uses less memory than `usize` on 64-bit systems. Specifically, its
/// representation is always a `u32` and has `repr(transparent)` enabled. (So
/// it is safe to transmute between a `u32` and a `SmallIndex`.)
///
/// A small index is typically useful in cases where there is no practical way
/// that the index will overflow a 32-bit integer. A good example of this is
/// an NFA state. If you could somehow build an NFA with `2^30` states, its
/// memory usage would be exorbitant and its runtime execution would be so
/// slow as to be completely worthless. Therefore, this crate generally deems
/// it acceptable to return an error if it would otherwise build an NFA that
/// requires a slice longer than what a 32-bit integer can index. In exchange,
/// we can use 32-bit indices instead of 64-bit indices in various places.
///
/// This type ensures this by providing a constructor that will return an error
/// if its argument cannot fit into the type. This makes it much easier to
/// handle these sorts of boundary cases that are otherwise extremely subtle.
///
/// On all targets, this type guarantees that its value will fit in a `u32`,
/// `i32`, `usize` and an `isize`. This means that on 16-bit targets, for
/// example, this type's maximum value will never overflow an `isize`,
/// which means it will never overflow an `i16` even though its internal
/// representation is still a `u32`.
///
/// The purpose for making the type fit into even signed integer types like
/// `isize` is to guarantee that the difference between any two small indices
/// is itself also a small index. This is useful in certain contexts, e.g.,
/// for delta encoding.
///
/// # Other types
///
/// The following types wrap `SmallIndex` to provide a more focused use case:
///
/// * [`PatternID`] is for representing the identifiers of patterns.
/// * [`StateID`] is for representing the identifiers of states in finite
/// automata. It is used for both NFAs and DFAs.
///
/// # Representation
///
/// This type is always represented internally by a `u32` and is marked as
/// `repr(transparent)`. Thus, this type always has the same representation as
/// a `u32`. It is thus safe to transmute between a `u32` and a `SmallIndex`.
///
/// # Indexing
///
/// For convenience, callers may use a `SmallIndex` to index slices.
///
/// # Safety
///
/// While a `SmallIndex` is meant to guarantee that its value fits into `usize`
/// without using as much space as a `usize` on all targets, callers must
/// not rely on this property for safety. Callers may choose to rely on this
/// property for correctness however. For example, creating a `SmallIndex` with
/// an invalid value can be done in entirely safe code. This may in turn result
/// in panics or silent logical errors.
#[derive(
    Clone, Copy, Debug, Default, Eq, Hash, PartialEq, PartialOrd, Ord,
)]
#[repr(transparent)]
pub(crate) struct SmallIndex(u32);

impl SmallIndex {
    /// The maximum index value.
    #[cfg(any(target_pointer_width = "32", target_pointer_width = "64"))]
    pub const MAX: SmallIndex =
        // FIXME: Use as_usize() once const functions in traits are stable.
        SmallIndex::new_unchecked(core::i32::MAX as usize - 1);

    /// The maximum index value.
    #[cfg(target_pointer_width = "16")]
    pub const MAX: SmallIndex =
        SmallIndex::new_unchecked(core::isize::MAX as usize - 1);

    /// The total number of values that can be represented as a small index.
    pub const LIMIT: usize = SmallIndex::MAX.as_usize() + 1;

    /// The zero index value.
    pub const ZERO: SmallIndex = SmallIndex::new_unchecked(0);

    /// The number of bytes that a single small index uses in memory.
    pub const SIZE: usize = core::mem::size_of::<SmallIndex>();

    /// Create a new small index.
    ///
    /// If the given index exceeds [`SmallIndex::MAX`], then this returns
    /// an error.
    #[inline]
    pub fn new(index: usize) -> Result<SmallIndex, SmallIndexError> {
        SmallIndex::try_from(index)
    }

    /// Create a new small index without checking whether the given value
    /// exceeds [`SmallIndex::MAX`].
    ///
    /// Using this routine with an invalid index value will result in
    /// unspecified behavior, but *not* undefined behavior. In particular, an
    /// invalid index value is likely to cause panics or possibly even silent
    /// logical errors.
    ///
    /// Callers must never rely on a `SmallIndex` to be within a certain range
    /// for memory safety.
    #[inline]
    pub const fn new_unchecked(index: usize) -> SmallIndex {
        // FIXME: Use as_u32() once const functions in traits are stable.
        SmallIndex::from_u32_unchecked(index as u32)
    }

    /// Create a new small index from a `u32` without checking whether the
    /// given value exceeds [`SmallIndex::MAX`].
    ///
    /// Using this routine with an invalid index value will result in
    /// unspecified behavior, but *not* undefined behavior. In particular, an
    /// invalid index value is likely to cause panics or possibly even silent
    /// logical errors.
    ///
    /// Callers must never rely on a `SmallIndex` to be within a certain range
    /// for memory safety.
    #[inline]
    pub const fn from_u32_unchecked(index: u32) -> SmallIndex {
        SmallIndex(index)
    }

    /// Like [`SmallIndex::new`], but panics if the given index is not valid.
    #[inline]
    pub fn must(index: usize) -> SmallIndex {
        SmallIndex::new(index).expect("invalid small index")
    }

    /// Return this small index as a `usize`. This is guaranteed to never
    /// overflow `usize`.
    #[inline]
    pub const fn as_usize(&self) -> usize {
        // FIXME: Use as_usize() once const functions in traits are stable.
        self.0 as usize
    }

    /// Return this small index as a `u64`. This is guaranteed to never
    /// overflow.
    #[inline]
    pub const fn as_u64(&self) -> u64 {
        // FIXME: Use u64::from() once const functions in traits are stable.
        self.0 as u64
    }

    /// Return the internal `u32` of this small index. This is guaranteed to
    /// never overflow `u32`.
    #[inline]
    pub const fn as_u32(&self) -> u32 {
        self.0
    }

    /// Return the internal `u32` of this small index represented as an `i32`.
    /// This is guaranteed to never overflow an `i32`.
    #[inline]
    pub const fn as_i32(&self) -> i32 {
        // This is OK because we guarantee that our max value is <= i32::MAX.
        self.0 as i32
    }

    /// Returns one more than this small index as a usize.
    ///
    /// Since a small index has constraints on its maximum value, adding `1` to
    /// it will always fit in a `usize`, `isize`, `u32` and an `i32`.
    #[inline]
    pub fn one_more(&self) -> usize {
        self.as_usize() + 1
    }

    /// Decode this small index from the bytes given using the native endian
    /// byte order for the current target.
    ///
    /// If the decoded integer is not representable as a small index for the
    /// current target, then this returns an error.
    #[inline]
    pub fn from_ne_bytes(
        bytes: [u8; 4],
    ) -> Result<SmallIndex, SmallIndexError> {
        let id = u32::from_ne_bytes(bytes);
        if id > SmallIndex::MAX.as_u32() {
            return Err(SmallIndexError { attempted: u64::from(id) });
        }
        Ok(SmallIndex::new_unchecked(id.as_usize()))
    }

    /// Decode this small index from the bytes given using the native endian
    /// byte order for the current target.
    ///
    /// This is analogous to [`SmallIndex::new_unchecked`] in that it does not
    /// check whether the decoded integer is representable as a small index.
    #[inline]
    pub fn from_ne_bytes_unchecked(bytes: [u8; 4]) -> SmallIndex {
        SmallIndex::new_unchecked(u32::from_ne_bytes(bytes).as_usize())
    }

    /// Return the underlying small index integer as raw bytes in native endian
    /// format.
    #[inline]
    pub fn to_ne_bytes(&self) -> [u8; 4] {
        self.0.to_ne_bytes()
    }
}

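// Editor's note: a brief test sketch, not part of the vendored file,
// showing the checked-construction contract documented above: values up to
// `SmallIndex::MAX` succeed, while anything larger is rejected with an
// error instead of being silently truncated.
#[cfg(test)]
mod small_index_sketch {
    use super::SmallIndex;

    #[test]
    fn checked_construction() {
        let idx = SmallIndex::new(42).unwrap();
        assert_eq!(idx.as_usize(), 42);
        assert_eq!(idx.one_more(), 43);
        assert!(SmallIndex::new(SmallIndex::MAX.as_usize() + 1).is_err());
    }
}
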
impl<T> core::ops::Index<SmallIndex> for [T] {
    type Output = T;

    #[inline]
    fn index(&self, index: SmallIndex) -> &T {
        &self[index.as_usize()]
    }
}

impl<T> core::ops::IndexMut<SmallIndex> for [T] {
    #[inline]
    fn index_mut(&mut self, index: SmallIndex) -> &mut T {
        &mut self[index.as_usize()]
    }
}

impl<T> core::ops::Index<SmallIndex> for Vec<T> {
    type Output = T;

    #[inline]
    fn index(&self, index: SmallIndex) -> &T {
        &self[index.as_usize()]
    }
}

impl<T> core::ops::IndexMut<SmallIndex> for Vec<T> {
    #[inline]
    fn index_mut(&mut self, index: SmallIndex) -> &mut T {
        &mut self[index.as_usize()]
    }
}

impl From<StateID> for SmallIndex {
    fn from(sid: StateID) -> SmallIndex {
        sid.0
    }
}

impl From<PatternID> for SmallIndex {
    fn from(pid: PatternID) -> SmallIndex {
        pid.0
    }
}

impl From<u8> for SmallIndex {
    fn from(index: u8) -> SmallIndex {
        SmallIndex::new_unchecked(usize::from(index))
    }
}

impl TryFrom<u16> for SmallIndex {
    type Error = SmallIndexError;

    fn try_from(index: u16) -> Result<SmallIndex, SmallIndexError> {
        if u32::from(index) > SmallIndex::MAX.as_u32() {
            return Err(SmallIndexError { attempted: u64::from(index) });
        }
        Ok(SmallIndex::new_unchecked(index.as_usize()))
    }
}

impl TryFrom<u32> for SmallIndex {
    type Error = SmallIndexError;

    fn try_from(index: u32) -> Result<SmallIndex, SmallIndexError> {
        if index > SmallIndex::MAX.as_u32() {
            return Err(SmallIndexError { attempted: u64::from(index) });
        }
        Ok(SmallIndex::new_unchecked(index.as_usize()))
    }
}

impl TryFrom<u64> for SmallIndex {
    type Error = SmallIndexError;

    fn try_from(index: u64) -> Result<SmallIndex, SmallIndexError> {
        if index > SmallIndex::MAX.as_u64() {
            return Err(SmallIndexError { attempted: index });
        }
        Ok(SmallIndex::new_unchecked(index.as_usize()))
    }
}

impl TryFrom<usize> for SmallIndex {
    type Error = SmallIndexError;

    fn try_from(index: usize) -> Result<SmallIndex, SmallIndexError> {
        if index > SmallIndex::MAX.as_usize() {
            return Err(SmallIndexError { attempted: index.as_u64() });
        }
        Ok(SmallIndex::new_unchecked(index))
    }
}

/// This error occurs when a small index could not be constructed.
///
/// This occurs when given an integer exceeding the maximum small index value.
///
/// When the `std` feature is enabled, this implements the `Error` trait.
#[derive(Clone, Debug, Eq, PartialEq)]
pub struct SmallIndexError {
    attempted: u64,
}

impl SmallIndexError {
    /// Returns the value that could not be converted to a small index.
    pub fn attempted(&self) -> u64 {
        self.attempted
    }
}

#[cfg(feature = "std")]
impl std::error::Error for SmallIndexError {}

impl core::fmt::Display for SmallIndexError {
    fn fmt(&self, f: &mut core::fmt::Formatter) -> core::fmt::Result {
        write!(
            f,
            "failed to create small index from {:?}, which exceeds {:?}",
            self.attempted(),
            SmallIndex::MAX,
        )
    }
}

#[derive(Clone, Debug)]
pub(crate) struct SmallIndexIter {
    rng: core::ops::Range<usize>,
}

impl Iterator for SmallIndexIter {
    type Item = SmallIndex;

    fn next(&mut self) -> Option<SmallIndex> {
        if self.rng.start >= self.rng.end {
            return None;
        }
        let next_id = self.rng.start + 1;
        let id = core::mem::replace(&mut self.rng.start, next_id);
        // new_unchecked is OK since we asserted that the number of
        // elements in this iterator will fit in an ID at construction.
        Some(SmallIndex::new_unchecked(id))
    }
}

macro_rules! index_type_impls {
    ($name:ident, $err:ident, $iter:ident, $withiter:ident) => {
        impl $name {
            /// The maximum value.
            pub const MAX: $name = $name(SmallIndex::MAX);

            /// The total number of values that can be represented.
            pub const LIMIT: usize = SmallIndex::LIMIT;

            /// The zero value.
            pub const ZERO: $name = $name(SmallIndex::ZERO);

            /// The number of bytes that a single value uses in memory.
            pub const SIZE: usize = SmallIndex::SIZE;

            /// Create a new value that is represented by a "small index."
            ///
            /// If the given index exceeds the maximum allowed value, then this
            /// returns an error.
            #[inline]
            pub fn new(value: usize) -> Result<$name, $err> {
                SmallIndex::new(value).map($name).map_err($err)
            }

            /// Create a new value without checking whether the given argument
            /// exceeds the maximum.
            ///
            /// Using this routine with an invalid value will result in
            /// unspecified behavior, but *not* undefined behavior. In
            /// particular, an invalid ID value is likely to cause panics or
            /// possibly even silent logical errors.
            ///
            /// Callers must never rely on this type to be within a certain
            /// range for memory safety.
            #[inline]
            pub const fn new_unchecked(value: usize) -> $name {
                $name(SmallIndex::new_unchecked(value))
            }

            /// Create a new value from a `u32` without checking whether the
            /// given value exceeds the maximum.
            ///
            /// Using this routine with an invalid value will result in
            /// unspecified behavior, but *not* undefined behavior. In
            /// particular, an invalid ID value is likely to cause panics or
            /// possibly even silent logical errors.
            ///
            /// Callers must never rely on this type to be within a certain
            /// range for memory safety.
            #[inline]
            pub const fn from_u32_unchecked(index: u32) -> $name {
                $name(SmallIndex::from_u32_unchecked(index))
            }

            /// Like `new`, but panics if the given value is not valid.
            #[inline]
            pub fn must(value: usize) -> $name {
                $name::new(value).expect(concat!(
                    "invalid ",
                    stringify!($name),
                    " value"
                ))
            }

            /// Return the internal value as a `usize`. This is guaranteed to
            /// never overflow `usize`.
            #[inline]
            pub const fn as_usize(&self) -> usize {
                self.0.as_usize()
            }

            /// Return the internal value as a `u64`. This is guaranteed to
            /// never overflow.
            #[inline]
            pub const fn as_u64(&self) -> u64 {
                self.0.as_u64()
            }

            /// Return the internal value as a `u32`. This is guaranteed to
            /// never overflow `u32`.
            #[inline]
            pub const fn as_u32(&self) -> u32 {
                self.0.as_u32()
            }

            /// Return the internal value as an `i32`. This is guaranteed to
            /// never overflow an `i32`.
            #[inline]
            pub const fn as_i32(&self) -> i32 {
                self.0.as_i32()
            }

            /// Returns one more than this value as a usize.
            ///
            /// Since values represented by a "small index" have constraints
            /// on their maximum value, adding `1` to it will always fit in a
            /// `usize`, `u32` and an `i32`.
            #[inline]
            pub fn one_more(&self) -> usize {
                self.0.one_more()
            }

            /// Decode this value from the bytes given using the native endian
            /// byte order for the current target.
            ///
            /// If the decoded integer is not representable as a small index
            /// for the current target, then this returns an error.
            #[inline]
            pub fn from_ne_bytes(bytes: [u8; 4]) -> Result<$name, $err> {
                SmallIndex::from_ne_bytes(bytes).map($name).map_err($err)
            }

            /// Decode this value from the bytes given using the native endian
            /// byte order for the current target.
            ///
            /// This is analogous to `new_unchecked` in that it does not check
            /// whether the decoded integer is representable as a small index.
            #[inline]
            pub fn from_ne_bytes_unchecked(bytes: [u8; 4]) -> $name {
                $name(SmallIndex::from_ne_bytes_unchecked(bytes))
            }

            /// Return the underlying integer as raw bytes in native endian
            /// format.
            #[inline]
            pub fn to_ne_bytes(&self) -> [u8; 4] {
                self.0.to_ne_bytes()
            }

            /// Returns an iterator over all values from 0 up to and not
            /// including the given length.
            ///
            /// If the given length exceeds this type's limit, then this
            /// panics.
            pub(crate) fn iter(len: usize) -> $iter {
                $iter::new(len)
            }
        }

        // We write our own Debug impl so that we get things like PatternID(5)
        // instead of PatternID(SmallIndex(5)).
        impl core::fmt::Debug for $name {
            fn fmt(&self, f: &mut core::fmt::Formatter) -> core::fmt::Result {
                f.debug_tuple(stringify!($name)).field(&self.as_u32()).finish()
            }
        }

        impl<T> core::ops::Index<$name> for [T] {
            type Output = T;

            #[inline]
            fn index(&self, index: $name) -> &T {
                &self[index.as_usize()]
            }
        }

        impl<T> core::ops::IndexMut<$name> for [T] {
            #[inline]
            fn index_mut(&mut self, index: $name) -> &mut T {
                &mut self[index.as_usize()]
            }
        }

        impl<T> core::ops::Index<$name> for Vec<T> {
            type Output = T;

            #[inline]
            fn index(&self, index: $name) -> &T {
                &self[index.as_usize()]
            }
        }

        impl<T> core::ops::IndexMut<$name> for Vec<T> {
            #[inline]
            fn index_mut(&mut self, index: $name) -> &mut T {
                &mut self[index.as_usize()]
            }
        }

        impl From<SmallIndex> for $name {
            fn from(index: SmallIndex) -> $name {
                $name(index)
            }
        }

        impl From<u8> for $name {
            fn from(value: u8) -> $name {
                $name(SmallIndex::from(value))
            }
        }

        impl TryFrom<u16> for $name {
            type Error = $err;

            fn try_from(value: u16) -> Result<$name, $err> {
                SmallIndex::try_from(value).map($name).map_err($err)
            }
        }

        impl TryFrom<u32> for $name {
            type Error = $err;

            fn try_from(value: u32) -> Result<$name, $err> {
                SmallIndex::try_from(value).map($name).map_err($err)
            }
        }

        impl TryFrom<u64> for $name {
            type Error = $err;

            fn try_from(value: u64) -> Result<$name, $err> {
                SmallIndex::try_from(value).map($name).map_err($err)
            }
        }

        impl TryFrom<usize> for $name {
            type Error = $err;

            fn try_from(value: usize) -> Result<$name, $err> {
                SmallIndex::try_from(value).map($name).map_err($err)
            }
        }

        /// This error occurs when an ID could not be constructed.
        ///
        /// This occurs when given an integer exceeding the maximum allowed
        /// value.
        ///
        /// When the `std` feature is enabled, this implements the `Error`
        /// trait.
        #[derive(Clone, Debug, Eq, PartialEq)]
        pub struct $err(SmallIndexError);

        impl $err {
            /// Returns the value that could not be converted to an ID.
            pub fn attempted(&self) -> u64 {
                self.0.attempted()
            }
        }

        #[cfg(feature = "std")]
        impl std::error::Error for $err {}

        impl core::fmt::Display for $err {
            fn fmt(&self, f: &mut core::fmt::Formatter) -> core::fmt::Result {
                write!(
                    f,
                    "failed to create {} from {:?}, which exceeds {:?}",
                    stringify!($name),
                    self.attempted(),
                    $name::MAX,
                )
            }
        }

        #[derive(Clone, Debug)]
        pub(crate) struct $iter(SmallIndexIter);

        impl $iter {
            fn new(len: usize) -> $iter {
                assert!(
                    len <= $name::LIMIT,
                    "cannot create iterator for {} when number of \
                     elements exceed {:?}",
                    stringify!($name),
                    $name::LIMIT,
                );
                $iter(SmallIndexIter { rng: 0..len })
            }
        }

        impl Iterator for $iter {
            type Item = $name;

            fn next(&mut self) -> Option<$name> {
                self.0.next().map($name)
            }
        }

        /// An iterator adapter that is like std::iter::Enumerate, but attaches
        /// small index values instead. It requires `ExactSizeIterator`. At
        /// construction, it ensures that the index of each element in the
        /// iterator is representable in the corresponding small index type.
        #[derive(Clone, Debug)]
        pub(crate) struct $withiter<I> {
            it: I,
            ids: $iter,
        }

        impl<I: Iterator + ExactSizeIterator> $withiter<I> {
            fn new(it: I) -> $withiter<I> {
                let ids = $name::iter(it.len());
                $withiter { it, ids }
            }
        }

        impl<I: Iterator + ExactSizeIterator> Iterator for $withiter<I> {
            type Item = ($name, I::Item);

            fn next(&mut self) -> Option<($name, I::Item)> {
                let item = self.it.next()?;
                // Number of elements in this iterator must match, according
                // to contract of ExactSizeIterator.
                let id = self.ids.next().unwrap();
                Some((id, item))
            }
        }
    };
}

/// The identifier of a pattern in an Aho-Corasick automaton.
///
/// It is represented by a `u32` even on 64-bit systems in order to conserve
/// space. Namely, on all targets, this type guarantees that its value will
/// fit in a `u32`, `i32`, `usize` and an `isize`. This means that on 16-bit
/// targets, for example, this type's maximum value will never overflow an
/// `isize`, which means it will never overflow an `i16` even though its
/// internal representation is still a `u32`.
///
/// # Safety
///
/// While a `PatternID` is meant to guarantee that its value fits into `usize`
/// without using as much space as a `usize` on all targets, callers must
/// not rely on this property for safety. Callers may choose to rely on this
/// property for correctness however. For example, creating a `PatternID` with
/// an invalid value can be done in entirely safe code. This may in turn result
/// in panics or silent logical errors.
#[derive(Clone, Copy, Default, Eq, Hash, PartialEq, PartialOrd, Ord)]
#[repr(transparent)]
pub struct PatternID(SmallIndex);

/// The identifier of a finite automaton state.
///
/// It is represented by a `u32` even on 64-bit systems in order to conserve
/// space. Namely, on all targets, this type guarantees that its value will
/// fit in a `u32`, `i32`, `usize` and an `isize`. This means that on 16-bit
/// targets, for example, this type's maximum value will never overflow an
/// `isize`, which means it will never overflow an `i16` even though its
/// internal representation is still a `u32`.
///
/// # Safety
///
/// While a `StateID` is meant to guarantee that its value fits into `usize`
/// without using as much space as a `usize` on all targets, callers must
/// not rely on this property for safety. Callers may choose to rely on this
/// property for correctness however. For example, creating a `StateID` with
/// an invalid value can be done in entirely safe code. This may in turn result
/// in panics or silent logical errors.
#[derive(Clone, Copy, Default, Eq, Hash, PartialEq, PartialOrd, Ord)]
#[repr(transparent)]
pub struct StateID(SmallIndex);

index_type_impls!(PatternID, PatternIDError, PatternIDIter, WithPatternIDIter);
index_type_impls!(StateID, StateIDError, StateIDIter, WithStateIDIter);

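// Editor's note: a hedged sketch, not part of the vendored file, of the
// API generated by `index_type_impls!` above: `PatternID` exposes the same
// checked (`new`), panicking (`must`) and iterating (`iter`) constructors
// that `SmallIndex` provides.
#[cfg(test)]
mod index_type_sketch {
    use super::PatternID;

    #[test]
    fn generated_api() {
        let pid = PatternID::must(2);
        assert_eq!(pid.as_usize(), 2);
        assert_eq!(pid.one_more(), 3);
        // LIMIT is one past MAX, so constructing it must fail.
        assert!(PatternID::new(PatternID::LIMIT).is_err());
        assert_eq!(PatternID::iter(3).count(), 3);
    }
}
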
/// A utility trait that defines a couple of adapters for making it convenient
/// to access indices as "small index" types. We require ExactSizeIterator so
/// that iterator construction can do a single check to make sure the index of
/// each element is representable by its small index type.
pub(crate) trait IteratorIndexExt: Iterator {
    fn with_pattern_ids(self) -> WithPatternIDIter<Self>
    where
        Self: Sized + ExactSizeIterator,
    {
        WithPatternIDIter::new(self)
    }

    fn with_state_ids(self) -> WithStateIDIter<Self>
    where
        Self: Sized + ExactSizeIterator,
    {
        WithStateIDIter::new(self)
    }
}

impl<I: Iterator> IteratorIndexExt for I {}
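// Editor's note: an illustrative sketch, not part of the vendored file, of
// the enumerate-like adapter above: `with_pattern_ids` pairs each element
// with a `PatternID` rather than a plain `usize`.
#[cfg(test)]
mod iterator_index_ext_sketch {
    use alloc::vec::Vec;

    use super::{IteratorIndexExt, PatternID};

    #[test]
    fn with_pattern_ids_enumerates() {
        let patterns = ["foo", "bar"];
        let pairs: Vec<(PatternID, &&str)> =
            patterns.iter().with_pattern_ids().collect();
        assert_eq!(pairs[0].0, PatternID::must(0));
        assert_eq!(pairs[1].0, PatternID::must(1));
    }
}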
214
vendor/aho-corasick/src/util/remapper.rs
vendored
Normal file
@ -0,0 +1,214 @@
use alloc::vec::Vec;

use crate::{nfa::noncontiguous, util::primitives::StateID};

/// Remappable is a tightly coupled abstraction that facilitates remapping
/// state identifiers in DFAs.
///
/// The main idea behind remapping state IDs is that DFAs often need to check
/// if a certain state is a "special" state of some kind (like a match state)
/// during a search. Since this is extremely perf critical code, we want this
/// check to be as fast as possible. Partitioning state IDs into, for example,
/// "non-match" and "match" states means one can tell if a state is a match
/// state via a simple comparison of the state ID.
///
/// The issue is that during the DFA construction process, it's not
/// particularly easy to partition the states. Instead, the simplest thing is
/// to often just do a pass over all of the states and shuffle them into their
/// desired partitionings. To do that, we need a mechanism for swapping states.
/// Hence, this abstraction.
///
/// Normally, for such little code, I would just duplicate it. But this is a
/// key optimization and the implementation is a bit subtle. So the abstraction
/// is basically a ham-fisted attempt at DRY. The only place we use this is in
/// the dense and one-pass DFAs.
///
/// See also src/dfa/special.rs for a more detailed explanation of how dense
/// DFAs are partitioned.
pub(crate) trait Remappable: core::fmt::Debug {
    /// Return the total number of states.
    fn state_len(&self) -> usize;

    /// Swap the states pointed to by the given IDs. The underlying finite
    /// state machine should be mutated such that all of the transitions in
    /// `id1` are now in the memory region where the transitions for `id2`
    /// were, and all of the transitions in `id2` are now in the memory region
    /// where the transitions for `id1` were.
    ///
    /// Essentially, this "moves" `id1` to `id2` and `id2` to `id1`.
    ///
    /// It is expected that, after calling this, the underlying state machine
    /// will be left in an inconsistent state, since any other transitions
    /// pointing to, e.g., `id1` need to be updated to point to `id2`, since
    /// that's where `id1` moved to.
    ///
    /// In order to "fix" the underlying inconsistent state, a `Remapper`
    /// should be used to guarantee that `remap` is called at the appropriate
    /// time.
    fn swap_states(&mut self, id1: StateID, id2: StateID);

    /// This must remap every single state ID in the underlying value according
    /// to the function given. For example, in a DFA, this should remap every
    /// transition and every starting state ID.
    fn remap(&mut self, map: impl Fn(StateID) -> StateID);
}

/// Remapper is an abstraction that manages the remapping of state IDs in a
/// finite state machine. This is useful when one wants to shuffle states into
/// different positions in the machine.
///
/// One of the key complexities this manages is the ability to correctly move
/// one state multiple times.
///
/// Once shuffling is complete, `remap` must be called, which will rewrite
/// all pertinent transitions to updated state IDs. Neglecting to call `remap`
/// will almost certainly result in a corrupt machine.
#[derive(Debug)]
pub(crate) struct Remapper {
    /// A map from the index of a state to its pre-multiplied identifier.
    ///
    /// When a state is swapped with another, then their corresponding
    /// locations in this map are also swapped. Thus, its new position will
    /// still point to its old pre-multiplied StateID.
    ///
    /// While there is a bit more to it, this then allows us to rewrite the
    /// state IDs in a DFA's transition table in a single pass. This is done
    /// by iterating over every ID in this map, then iterating over each
    /// transition for the state at that ID and re-mapping the transition from
    /// `old_id` to `map[dfa.to_index(old_id)]`. That is, we find the position
    /// in this map where `old_id` *started*, and set it to where it ended up
    /// after all swaps have been completed.
    map: Vec<StateID>,
    /// A way to map indices to state IDs (and back).
    idx: IndexMapper,
}

impl Remapper {
    /// Create a new remapper from the given remappable implementation. The
    /// remapper can then be used to swap states. The remappable value given
    /// here must be the same one given to `swap` and `remap`.
    ///
    /// The given stride should be the stride of the transition table expressed
    /// as a power of 2. This stride is used to map between state IDs and state
    /// indices. If state IDs and state indices are equivalent, then provide
    /// a `stride2` of `0`, which acts as an identity.
    pub(crate) fn new(r: &impl Remappable, stride2: usize) -> Remapper {
        let idx = IndexMapper { stride2 };
        let map = (0..r.state_len()).map(|i| idx.to_state_id(i)).collect();
        Remapper { map, idx }
    }

    /// Swap two states. Once this is called, callers must follow through to
    /// call `remap`, or else it's possible for the underlying remappable
    /// value to be in a corrupt state.
    pub(crate) fn swap(
        &mut self,
        r: &mut impl Remappable,
        id1: StateID,
        id2: StateID,
    ) {
        if id1 == id2 {
            return;
        }
        r.swap_states(id1, id2);
        self.map.swap(self.idx.to_index(id1), self.idx.to_index(id2));
    }

    /// Complete the remapping process by rewriting all state IDs in the
    /// remappable value according to the swaps performed.
    pub(crate) fn remap(mut self, r: &mut impl Remappable) {
        // Update the map to account for states that have been swapped
        // multiple times. For example, if (A, C) and (C, G) are swapped, then
        // transitions previously pointing to A should now point to G. But if
        // we don't update our map, they will erroneously be set to C. All we
        // do is follow the swaps in our map until we see our original state
        // ID.
        //
        // The intuition here is to think about how changes are made to the
        // map: only through pairwise swaps. That means that starting at any
        // given state, it is always possible to find the loop back to that
        // state by following the swaps represented in the map (which might be
        // 0 swaps).
        //
        // We are also careful to clone the map before starting in order to
        // freeze it. We use the frozen map to find our loops, since we need to
        // update our map as well. Without freezing it, our updates could break
        // the loops referenced above and produce incorrect results.
        let oldmap = self.map.clone();
        for i in 0..r.state_len() {
            let cur_id = self.idx.to_state_id(i);
            let mut new_id = oldmap[i];
            if cur_id == new_id {
                continue;
            }
            loop {
                let id = oldmap[self.idx.to_index(new_id)];
                if cur_id == id {
                    self.map[i] = new_id;
                    break;
                }
                new_id = id;
            }
        }
        r.remap(|sid| self.map[self.idx.to_index(sid)]);
    }
}

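// Editor's note: a hedged sketch, not part of the vendored file, of the
// swap-then-remap protocol above. The toy machine's "transition table" is
// just a Vec<StateID> of self-loops, with stride2 == 0 so IDs and indices
// coincide.
#[cfg(test)]
mod remapper_sketch {
    use alloc::vec::Vec;

    use super::{Remappable, Remapper};
    use crate::util::primitives::StateID;

    #[derive(Debug)]
    struct Toy(Vec<StateID>);

    impl Remappable for Toy {
        fn state_len(&self) -> usize {
            self.0.len()
        }

        fn swap_states(&mut self, id1: StateID, id2: StateID) {
            self.0.swap(id1.as_usize(), id2.as_usize());
        }

        fn remap(&mut self, map: impl Fn(StateID) -> StateID) {
            for sid in self.0.iter_mut() {
                *sid = map(*sid);
            }
        }
    }

    #[test]
    fn swap_then_remap() {
        // Three states, each with one transition pointing at itself.
        let mut toy = Toy((0..3).map(StateID::must).collect());
        let mut remapper = Remapper::new(&toy, 0);
        remapper.swap(&mut toy, StateID::must(0), StateID::must(2));
        remapper.remap(&mut toy);
        // After remapping, every self-loop points at its state's new ID,
        // so the table reads 0, 1, 2 again even though states 0 and 2
        // traded places in memory.
        let expect: Vec<StateID> = (0..3).map(StateID::must).collect();
        assert_eq!(toy.0, expect);
    }
}
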
/// A simple type for mapping between state indices and state IDs.
///
/// The reason why this exists is because state IDs are "premultiplied" in a
/// DFA. That is, in order to get to the transitions for a particular state,
/// one need only use the state ID as-is, instead of having to multiply it by
/// the transition table's stride.
///
/// The downside of this is that it's inconvenient to map between state IDs
/// using a dense map, e.g., Vec<StateID>. That's because state IDs look like
/// `0`, `stride`, `2*stride`, `3*stride`, etc., instead of `0`, `1`, `2`, `3`,
/// etc.
///
/// Since our state IDs are premultiplied, we can convert back-and-forth
/// between IDs and indices by simply unmultiplying the IDs and multiplying the
/// indices.
///
/// Note that for a sparse NFA, state IDs and indices are equivalent. In this
/// case, we set the stride of the index mapper to be `0`, which acts as an
/// identity.
#[derive(Debug)]
struct IndexMapper {
    /// The power of 2 corresponding to the stride of the corresponding
    /// transition table. 'id >> stride2' de-multiplies an ID while 'index <<
    /// stride2' pre-multiplies an index to an ID.
    stride2: usize,
}

impl IndexMapper {
    /// Convert a state ID to a state index.
    fn to_index(&self, id: StateID) -> usize {
        id.as_usize() >> self.stride2
    }

    /// Convert a state index to a state ID.
    fn to_state_id(&self, index: usize) -> StateID {
        // CORRECTNESS: If the given index is not valid, then it is not
        // required for this to panic or return a valid state ID. We'll "just"
        // wind up with panics or silent logic errors at some other point. But
        // this is OK because if Remappable::state_len is correct and so is
        // 'to_index', then all inputs to 'to_state_id' should be valid indices
        // and thus transform into valid state IDs.
        StateID::new_unchecked(index << self.stride2)
    }
}

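// Editor's note: a tiny sketch, not part of the vendored file, of the
// premultiplied-ID arithmetic described above. With a stride of 2^5 = 32,
// state index 3 corresponds to premultiplied ID 96, and the two mappings
// invert each other; stride2 == 0 degenerates to the identity.
#[cfg(test)]
mod index_mapper_sketch {
    use super::IndexMapper;

    #[test]
    fn premultiplied_round_trip() {
        let idx = IndexMapper { stride2: 5 };
        let id = idx.to_state_id(3);
        assert_eq!(id.as_usize(), 3 << 5); // 96
        assert_eq!(idx.to_index(id), 3);

        let identity = IndexMapper { stride2: 0 };
        assert_eq!(identity.to_index(identity.to_state_id(7)), 7);
    }
}
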
impl Remappable for noncontiguous::NFA {
    fn state_len(&self) -> usize {
        noncontiguous::NFA::states(self).len()
    }

    fn swap_states(&mut self, id1: StateID, id2: StateID) {
        noncontiguous::NFA::swap_states(self, id1, id2)
    }

    fn remap(&mut self, map: impl Fn(StateID) -> StateID) {
        noncontiguous::NFA::remap(self, map)
    }
}
1148
vendor/aho-corasick/src/util/search.rs
vendored
Normal file
File diff suppressed because it is too large
42
vendor/aho-corasick/src/util/special.rs
vendored
Normal file
@ -0,0 +1,42 @@
use crate::util::primitives::StateID;

/// A collection of sentinel state IDs for Aho-Corasick automata.
///
/// This specifically enables the technique by which we determine which states
/// are dead, matches or start states. Namely, by arranging states in a
/// particular order, we can determine the type of a state simply by looking at
/// its ID.
#[derive(Clone, Debug)]
pub(crate) struct Special {
    /// The maximum ID of all the "special" states. This corresponds either to
    /// start_anchored_id when a prefilter is active or to max_match_id when a
    /// prefilter is not active. The idea here is that if there is no
    /// prefilter, then there is no point in treating start states as special.
    pub(crate) max_special_id: StateID,
    /// The maximum ID of all the match states. Any state ID bigger than this
    /// is guaranteed to be a non-match ID.
    ///
    /// It is possible and legal for max_match_id to be equal to
    /// start_anchored_id, which occurs precisely in the case where the empty
    /// string is a pattern that was added to the underlying automaton.
    pub(crate) max_match_id: StateID,
    /// The state ID of the start state used for unanchored searches.
    pub(crate) start_unanchored_id: StateID,
    /// The state ID of the start state used for anchored searches. This is
    /// always start_unanchored_id+1.
    pub(crate) start_anchored_id: StateID,
}

impl Special {
    /// Create a new set of "special" state IDs with all IDs initialized to
    /// zero. The general idea here is that they will be updated and set to
    /// correct values later.
    pub(crate) fn zero() -> Special {
        Special {
            max_special_id: StateID::ZERO,
            max_match_id: StateID::ZERO,
            start_unanchored_id: StateID::ZERO,
            start_anchored_id: StateID::ZERO,
        }
    }
}
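// Editor's note: a hedged sketch, not part of the vendored file, of the
// sentinel-ordering technique described above. With states laid out so the
// special states come first, classifying a state is just integer compares
// on its ID; the concrete layout below is hypothetical.
#[cfg(test)]
mod special_sketch {
    use super::Special;
    use crate::util::primitives::StateID;

    #[test]
    fn classify_by_comparison() {
        // Hypothetical layout: dead = 0, matches = 1..=2, starts = 3 and 4,
        // with a prefilter active so the start states count as special.
        let special = Special {
            max_special_id: StateID::must(4),
            max_match_id: StateID::must(2),
            start_unanchored_id: StateID::must(3),
            start_anchored_id: StateID::must(4),
        };
        let sid = StateID::must(2);
        // No per-state lookup needed: both questions are plain compares.
        assert!(sid <= special.max_special_id); // special?
        assert!(sid > StateID::ZERO && sid <= special.max_match_id); // match?
    }
}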
1
vendor/android-tzdata/.cargo-checksum.json
vendored
Normal file
@ -0,0 +1 @@
{"files":{"Cargo.toml":"a87d9acc9827a50c7a96a88720c5dd055cbc08b1144dff95bd572ff977d4a79a","LICENSE-APACHE":"4458503dd48e88c4e0b945fb252a08b93c40ec757309b8ffa7c594dfa1e35104","LICENSE-MIT":"002c2696d92b5c8cf956c11072baa58eaf9f6ade995c031ea635c6a1ee342ad1","README.md":"6dfe0c602dc61eebe118900ed66a2c1f7887b9fe95b36e1c2974c4e8fa7ebd4b","src/lib.rs":"8f421233df83f82e737930ca8a2ad254966334183148bcc170f9c405df230de2","src/tzdata.rs":"78920925b04219910511e9a1f036f468cd2925c0054f280d6a00b106529046e7"},"package":"e999941b234f3131b00bc13c22d06e8c5ff726d1b6318ac7eb276997bbb4fef0"}
34
vendor/android-tzdata/Cargo.toml
vendored
Normal file
@ -0,0 +1,34 @@
# THIS FILE IS AUTOMATICALLY GENERATED BY CARGO
#
# When uploading crates to the registry Cargo will automatically
# "normalize" Cargo.toml files for maximal compatibility
# with all versions of Cargo and also rewrite `path` dependencies
# to registry (e.g., crates.io) dependencies.
#
# If you are reading this file be aware that the original Cargo.toml
# will likely look very different (and much more reasonable).
# See Cargo.toml.orig for the original contents.

[package]
edition = "2018"
name = "android-tzdata"
version = "0.1.1"
authors = ["RumovZ"]
include = [
    "src/**/*",
    "LICENSE-*",
    "README.md",
]
description = "Parser for the Android-specific tzdata file"
readme = "README.md"
keywords = [
    "parser",
    "android",
    "timezone",
]
categories = ["date-and-time"]
license = "MIT OR Apache-2.0"
repository = "https://github.com/RumovZ/android-tzdata"

[dev-dependencies.zip]
version = "0.6.4"
201
vendor/android-tzdata/LICENSE-APACHE
vendored
Normal file
@ -0,0 +1,201 @@
                                 Apache License
                           Version 2.0, January 2004
                        http://www.apache.org/licenses/

   TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION

   1. Definitions.

      "License" shall mean the terms and conditions for use, reproduction,
      and distribution as defined by Sections 1 through 9 of this document.

      "Licensor" shall mean the copyright owner or entity authorized by
      the copyright owner that is granting the License.

      "Legal Entity" shall mean the union of the acting entity and all
      other entities that control, are controlled by, or are under common
      control with that entity. For the purposes of this definition,
      "control" means (i) the power, direct or indirect, to cause the
      direction or management of such entity, whether by contract or
      otherwise, or (ii) ownership of fifty percent (50%) or more of the
      outstanding shares, or (iii) beneficial ownership of such entity.

      "You" (or "Your") shall mean an individual or Legal Entity
      exercising permissions granted by this License.

      "Source" form shall mean the preferred form for making modifications,
      including but not limited to software source code, documentation
      source, and configuration files.

      "Object" form shall mean any form resulting from mechanical
      transformation or translation of a Source form, including but
      not limited to compiled object code, generated documentation,
      and conversions to other media types.

      "Work" shall mean the work of authorship, whether in Source or
      Object form, made available under the License, as indicated by a
      copyright notice that is included in or attached to the work
      (an example is provided in the Appendix below).

      "Derivative Works" shall mean any work, whether in Source or Object
      form, that is based on (or derived from) the Work and for which the
      editorial revisions, annotations, elaborations, or other modifications
      represent, as a whole, an original work of authorship. For the purposes
      of this License, Derivative Works shall not include works that remain
      separable from, or merely link (or bind by name) to the interfaces of,
      the Work and Derivative Works thereof.

      "Contribution" shall mean any work of authorship, including
      the original version of the Work and any modifications or additions
      to that Work or Derivative Works thereof, that is intentionally
      submitted to Licensor for inclusion in the Work by the copyright owner
      or by an individual or Legal Entity authorized to submit on behalf of
      the copyright owner. For the purposes of this definition, "submitted"
      means any form of electronic, verbal, or written communication sent
      to the Licensor or its representatives, including but not limited to
      communication on electronic mailing lists, source code control systems,
      and issue tracking systems that are managed by, or on behalf of, the
      Licensor for the purpose of discussing and improving the Work, but
      excluding communication that is conspicuously marked or otherwise
      designated in writing by the copyright owner as "Not a Contribution."

      "Contributor" shall mean Licensor and any individual or Legal Entity
      on behalf of whom a Contribution has been received by Licensor and
      subsequently incorporated within the Work.

   2. Grant of Copyright License. Subject to the terms and conditions of
      this License, each Contributor hereby grants to You a perpetual,
      worldwide, non-exclusive, no-charge, royalty-free, irrevocable
      copyright license to reproduce, prepare Derivative Works of,
      publicly display, publicly perform, sublicense, and distribute the
      Work and such Derivative Works in Source or Object form.

   3. Grant of Patent License. Subject to the terms and conditions of
      this License, each Contributor hereby grants to You a perpetual,
      worldwide, non-exclusive, no-charge, royalty-free, irrevocable
      (except as stated in this section) patent license to make, have made,
      use, offer to sell, sell, import, and otherwise transfer the Work,
      where such license applies only to those patent claims licensable
      by such Contributor that are necessarily infringed by their
      Contribution(s) alone or by combination of their Contribution(s)
      with the Work to which such Contribution(s) was submitted. If You
      institute patent litigation against any entity (including a
      cross-claim or counterclaim in a lawsuit) alleging that the Work
      or a Contribution incorporated within the Work constitutes direct
      or contributory patent infringement, then any patent licenses
      granted to You under this License for that Work shall terminate
      as of the date such litigation is filed.

   4. Redistribution. You may reproduce and distribute copies of the
      Work or Derivative Works thereof in any medium, with or without
      modifications, and in Source or Object form, provided that You
      meet the following conditions:

      (a) You must give any other recipients of the Work or
          Derivative Works a copy of this License; and

      (b) You must cause any modified files to carry prominent notices
          stating that You changed the files; and

      (c) You must retain, in the Source form of any Derivative Works
          that You distribute, all copyright, patent, trademark, and
          attribution notices from the Source form of the Work,
          excluding those notices that do not pertain to any part of
          the Derivative Works; and

      (d) If the Work includes a "NOTICE" text file as part of its
          distribution, then any Derivative Works that You distribute must
          include a readable copy of the attribution notices contained
          within such NOTICE file, excluding those notices that do not
          pertain to any part of the Derivative Works, in at least one
          of the following places: within a NOTICE text file distributed
          as part of the Derivative Works; within the Source form or
          documentation, if provided along with the Derivative Works; or,
          within a display generated by the Derivative Works, if and
          wherever such third-party notices normally appear. The contents
          of the NOTICE file are for informational purposes only and
          do not modify the License. You may add Your own attribution
          notices within Derivative Works that You distribute, alongside
          or as an addendum to the NOTICE text from the Work, provided
          that such additional attribution notices cannot be construed
          as modifying the License.

      You may add Your own copyright statement to Your modifications and
      may provide additional or different license terms and conditions
      for use, reproduction, or distribution of Your modifications, or
      for any such Derivative Works as a whole, provided Your use,
      reproduction, and distribution of the Work otherwise complies with
      the conditions stated in this License.

   5. Submission of Contributions. Unless You explicitly state otherwise,
      any Contribution intentionally submitted for inclusion in the Work
      by You to the Licensor shall be under the terms and conditions of
      this License, without any additional terms or conditions.
      Notwithstanding the above, nothing herein shall supersede or modify
      the terms of any separate license agreement you may have executed
      with Licensor regarding such Contributions.

   6. Trademarks. This License does not grant permission to use the trade
      names, trademarks, service marks, or product names of the Licensor,
      except as required for reasonable and customary use in describing the
      origin of the Work and reproducing the content of the NOTICE file.

   7. Disclaimer of Warranty. Unless required by applicable law or
      agreed to in writing, Licensor provides the Work (and each
      Contributor provides its Contributions) on an "AS IS" BASIS,
      WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
      implied, including, without limitation, any warranties or conditions
      of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A
      PARTICULAR PURPOSE. You are solely responsible for determining the
      appropriateness of using or redistributing the Work and assume any
      risks associated with Your exercise of permissions under this License.

   8. Limitation of Liability. In no event and under no legal theory,
      whether in tort (including negligence), contract, or otherwise,
      unless required by applicable law (such as deliberate and grossly
      negligent acts) or agreed to in writing, shall any Contributor be
      liable to You for damages, including any direct, indirect, special,
      incidental, or consequential damages of any character arising as a
      result of this License or out of the use or inability to use the
      Work (including but not limited to damages for loss of goodwill,
      work stoppage, computer failure or malfunction, or any and all
      other commercial damages or losses), even if such Contributor
      has been advised of the possibility of such damages.

   9. Accepting Warranty or Additional Liability. While redistributing
      the Work or Derivative Works thereof, You may choose to offer,
      and charge a fee for, acceptance of support, warranty, indemnity,
      or other liability obligations and/or rights consistent with this
      License. However, in accepting such obligations, You may act only
      on Your own behalf and on Your sole responsibility, not on behalf
      of any other Contributor, and only if You agree to indemnify,
      defend, and hold each Contributor harmless for any liability
      incurred by, or claims asserted against, such Contributor by reason
      of your accepting any such warranty or additional liability.

   END OF TERMS AND CONDITIONS

   APPENDIX: How to apply the Apache License to your work.

      To apply the Apache License to your work, attach the following
      boilerplate notice, with the fields enclosed by brackets "[]"
      replaced with your own identifying information. (Don't include
      the brackets!) The text should be enclosed in the appropriate
      comment syntax for the file format. We also recommend that a
      file or class name and description of purpose be included on the
      same "printed page" as the copyright notice for easier
      identification within third-party archives.

   Copyright [yyyy] [name of copyright owner]

   Licensed under the Apache License, Version 2.0 (the "License");
   you may not use this file except in compliance with the License.
   You may obtain a copy of the License at

       http://www.apache.org/licenses/LICENSE-2.0

   Unless required by applicable law or agreed to in writing, software
   distributed under the License is distributed on an "AS IS" BASIS,
|
||||
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
See the License for the specific language governing permissions and
|
||||
limitations under the License.
|
21
vendor/android-tzdata/LICENSE-MIT
vendored
Normal file
@ -0,0 +1,21 @@
MIT License

Copyright (c) [year] [fullname]

Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:

The above copyright notice and this permission notice shall be included in all
copies or substantial portions of the Software.

THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
SOFTWARE.
20
vendor/android-tzdata/README.md
vendored
Normal file
@ -0,0 +1,20 @@
# android-tzdata

Parser for the Android-specific tzdata file.

## License

Licensed under either of

- Apache License, Version 2.0
  ([LICENSE-APACHE](LICENSE-APACHE) or http://www.apache.org/licenses/LICENSE-2.0)
- MIT license
  ([LICENSE-MIT](LICENSE-MIT) or http://opensource.org/licenses/MIT)

at your option.

## Contribution

Unless you explicitly state otherwise, any contribution intentionally submitted
for inclusion in the work by you, as defined in the Apache-2.0 license, shall be
dual licensed as above, without any additional terms or conditions.
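The README above shows no usage; the crate's single entry point is `find_tz_data`, defined in `src/lib.rs` below. A minimal standalone sketch (assuming an Android environment, or `ANDROID_DATA`/`ANDROID_ROOT` pointing at a directory tree that contains a `tzdata` file):

```rust
use android_tzdata::find_tz_data;

fn main() -> Result<(), std::io::Error> {
    // Look up the TZif payload for one zone; this fails with NotFound when
    // no tzdata file is reachable or the zone is missing from its index.
    let tz_data = find_tz_data("Europe/Kiev")?;
    assert!(tz_data.starts_with(b"TZif")); // a version byte follows the magic
    println!("entry is {} bytes", tz_data.len());
    Ok(())
}
```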
29
vendor/android-tzdata/src/lib.rs
vendored
Normal file
@ -0,0 +1,29 @@
//! Parser for the Android-specific tzdata file.

mod tzdata;

/// Tries to locate the `tzdata` file, parse it, and return the entry for the
/// requested time zone.
///
/// # Errors
///
/// Returns an [std::io::Error] if the `tzdata` file cannot be found or parsed, or
/// if it does not contain the requested time zone entry.
///
/// # Example
///
/// ```rust
/// # use std::error::Error;
/// # use android_tzdata::find_tz_data;
/// #
/// # fn main() -> Result<(), Box<dyn Error>> {
/// let tz_data = find_tz_data("Europe/Kiev")?;
/// // Check that it's version 2 of the [Time Zone Information Format](https://www.ietf.org/archive/id/draft-murchison-rfc8536bis-02.html).
/// assert!(tz_data.starts_with(b"TZif2"));
/// # Ok(())
/// # }
/// ```
pub fn find_tz_data(tz_name: impl AsRef<str>) -> Result<Vec<u8>, std::io::Error> {
    let mut file = tzdata::find_file()?;
    tzdata::find_tz_data_in_file(&mut file, tz_name.as_ref())
}
166
vendor/android-tzdata/src/tzdata.rs
vendored
Normal file
@ -0,0 +1,166 @@
//! Logic was mainly ported from https://android.googlesource.com/platform/libcore/+/jb-mr2-release/luni/src/main/java/libcore/util/ZoneInfoDB.java

use core::{cmp::Ordering, convert::TryInto};
use std::{
    fs::File,
    io::{self, ErrorKind, Read, Seek, SeekFrom},
};

// The database uses 32-bit (4 byte) integers.
const TZ_INT_SIZE: usize = 4;
// The first 12 bytes contain a special version string.
const MAGIC_SIZE: usize = 12;
const HEADER_SIZE: usize = MAGIC_SIZE + 3 * TZ_INT_SIZE;
// The database reserves 40 bytes for each id.
const TZ_NAME_SIZE: usize = 40;
const INDEX_ENTRY_SIZE: usize = TZ_NAME_SIZE + 3 * TZ_INT_SIZE;
const TZDATA_LOCATIONS: [TzdataLocation; 2] = [
    TzdataLocation {
        env_var: "ANDROID_DATA",
        path: "/misc/zoneinfo/",
    },
    TzdataLocation {
        env_var: "ANDROID_ROOT",
        path: "/usr/share/zoneinfo/",
    },
];

#[derive(Debug)]
struct TzdataLocation {
    env_var: &'static str,
    path: &'static str,
}

#[derive(Debug, Clone, Copy)]
struct Header {
    index_offset: usize,
    data_offset: usize,
    _zonetab_offset: usize,
}

#[derive(Debug)]
struct Index(Vec<u8>);

#[derive(Debug, Clone, Copy)]
struct IndexEntry<'a> {
    _name: &'a [u8],
    offset: usize,
    length: usize,
    _raw_utc_offset: usize,
}

pub(super) fn find_file() -> Result<File, io::Error> {
    for location in &TZDATA_LOCATIONS {
        if let Ok(env_value) = std::env::var(location.env_var) {
            if let Ok(file) = File::open(format!("{}{}tzdata", env_value, location.path)) {
                return Ok(file);
            }
        }
    }
    Err(io::Error::from(io::ErrorKind::NotFound))
}

pub(super) fn find_tz_data_in_file(
    mut file: impl Read + Seek,
    tz_name: &str,
) -> Result<Vec<u8>, io::Error> {
    let header = Header::new(&mut file)?;
    let index = Index::new(&mut file, header)?;
    if let Some(entry) = index.find_entry(tz_name) {
        file.seek(SeekFrom::Start((entry.offset + header.data_offset) as u64))?;
        let mut tz_data = vec![0u8; entry.length];
        file.read_exact(&mut tz_data)?;
        Ok(tz_data)
    } else {
        Err(io::Error::from(ErrorKind::NotFound))
    }
}

impl Header {
    fn new(mut file: impl Read + Seek) -> Result<Self, io::Error> {
        let mut buf = [0; HEADER_SIZE];
        file.read_exact(&mut buf)?;
        if !buf.starts_with(b"tzdata") || buf[MAGIC_SIZE - 1] != 0u8 {
            return Err(io::Error::new(
                io::ErrorKind::InvalidData,
                "invalid magic number",
            ));
        }
        Ok(Self {
            index_offset: parse_tz_int(&buf, MAGIC_SIZE) as usize,
            data_offset: parse_tz_int(&buf, MAGIC_SIZE + TZ_INT_SIZE) as usize,
            _zonetab_offset: parse_tz_int(&buf, MAGIC_SIZE + 2 * TZ_INT_SIZE) as usize,
        })
    }
}

impl Index {
    fn new(mut file: impl Read + Seek, header: Header) -> Result<Self, io::Error> {
        file.seek(SeekFrom::Start(header.index_offset as u64))?;
        let size = header.data_offset - header.index_offset;
        let mut bytes = vec![0; size];
        file.read_exact(&mut bytes)?;
        Ok(Self(bytes))
    }

    fn find_entry(&self, name: &str) -> Option<IndexEntry> {
        let name_bytes = name.as_bytes();
        let name_len = name_bytes.len();
        if name_len > TZ_NAME_SIZE {
            return None;
        }

        let zeros = [0u8; TZ_NAME_SIZE];
        let cmp = |chunk: &&[u8]| -> Ordering {
            // tz names always have TZ_NAME_SIZE bytes and are right-padded with 0s,
            // so we check that a chunk starts with `name` and the remaining bytes are 0
            chunk[..name_len]
                .cmp(name_bytes)
                .then_with(|| chunk[name_len..TZ_NAME_SIZE].cmp(&zeros[name_len..]))
        };

        let chunks: Vec<_> = self.0.chunks_exact(INDEX_ENTRY_SIZE).collect();
        chunks
            .binary_search_by(cmp)
            .map(|idx| IndexEntry::new(chunks[idx]))
            .ok()
    }
}

impl<'a> IndexEntry<'a> {
    fn new(bytes: &'a [u8]) -> Self {
        Self {
            _name: bytes[..TZ_NAME_SIZE]
                .splitn(2, |&b| b == 0u8)
                .next()
                .unwrap(),
            offset: parse_tz_int(bytes, TZ_NAME_SIZE) as usize,
            length: parse_tz_int(bytes, TZ_NAME_SIZE + TZ_INT_SIZE) as usize,
            _raw_utc_offset: parse_tz_int(bytes, TZ_NAME_SIZE + 2 * TZ_INT_SIZE) as usize,
        }
    }
}

/// Panics if `slice` does not contain [TZ_INT_SIZE] bytes beginning at `start`.
fn parse_tz_int(slice: &[u8], start: usize) -> u32 {
    u32::from_be_bytes(slice[start..start + TZ_INT_SIZE].try_into().unwrap())
}

#[cfg(test)]
mod test {
    use super::*;
    use std::fs::File;
    use std::io::Cursor;

    #[test]
    fn parse() {
        let mut archive = File::open("tests/resources/tzdata.zip").unwrap();
        let mut zip = zip::ZipArchive::new(&mut archive).unwrap();
        let mut file = zip.by_index(0).unwrap();
        let mut data = Vec::new();
        file.read_to_end(&mut data).unwrap();
        let cursor = Cursor::new(data);
        let tz = find_tz_data_in_file(cursor, "Europe/Kiev").unwrap();
        assert!(tz.starts_with(b"TZif2"));
    }
}
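For orientation, this is the byte layout that the constants in `tzdata.rs` encode, plus a self-contained check of the big-endian field parsing. This is a sketch with made-up field values, not part of the vendored file; the 52-byte entry size follows from `TZ_NAME_SIZE + 3 * TZ_INT_SIZE`:

```rust
// tzdata file layout (all integers are big-endian u32):
//   bytes 0..12   magic "tzdata<version>\0"        (MAGIC_SIZE)
//   bytes 12..24  index, data, zonetab offsets     (3 * TZ_INT_SIZE)
//   index entries, 52 bytes each (INDEX_ENTRY_SIZE):
//     bytes 0..40   zone name, NUL-padded          (TZ_NAME_SIZE)
//     bytes 40..52  offset, length, raw UTC offset

/// Mirrors the crate's `parse_tz_int` (edition 2021, TryInto in prelude).
fn be_u32(slice: &[u8], start: usize) -> u32 {
    u32::from_be_bytes(slice[start..start + 4].try_into().unwrap())
}

fn main() {
    let mut entry = [0u8; 52]; // a hypothetical, hand-built index entry
    entry[..11].copy_from_slice(b"Europe/Kiev");
    entry[40..44].copy_from_slice(&1024u32.to_be_bytes()); // data offset
    entry[44..48].copy_from_slice(&590u32.to_be_bytes()); // payload length
    assert_eq!(be_u32(&entry, 40), 1024);
    assert_eq!(be_u32(&entry, 44), 590);
}
```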
1
vendor/android_system_properties/.cargo-checksum.json
vendored
Normal file
@ -0,0 +1 @@
{"files":{"CONTRIBUTING.md":"0834cb3b5e092977688d73d219a05bed23ae0ecb54b6d6e5d866ce07f6583b5e","Cargo.lock":"37ffc00dbbbec58fd27b4f4cb597e5402d6cf615ce0458f62a73a7f0d987e5bd","Cargo.toml":"e9e8c037cdef7adc9794b17c13e5a014421524d67ea5048bc09cf70ef13c782c","LICENSE-APACHE":"216486f29671a4262efe32af6d84a75bef398127f8c5f369b5c8305983887a06","LICENSE-MIT":"80f275e90d799911ed3830a7f242a2ef5a4ade2092fe0aa07bfb2d2cf2f2b95e","README.md":"aba8ff5dbd0712326d97d32bc6a3b66b24d1980a446c238f7e14b96784766cd1","examples/time_zone.rs":"8edb32a946ef2680146ba9ac16c233dd94391ac9f98464e9fb6f87d3954b72a9","src/lib.rs":"0004133d6c3805bf449e7183d2931e9640167511bea6cd12b400805073c4305d"},"package":"819e7219dbd41043ac279b19830f2efc897156490d7fd6ea916720117ee66311"}
40
vendor/android_system_properties/CONTRIBUTING.md
vendored
Normal file
@ -0,0 +1,40 @@
# Contributing

Contributions are very much welcome. Here are the guidelines if you are thinking of helping us:

## Contributions

Contributions should be made in the form of GitHub pull requests.
Each pull request will be reviewed by a core contributor (someone with
permission to land patches) and either landed in the main tree or
given feedback for changes that would be required.

Should you wish to work on an issue, please claim it first by commenting on
the GitHub issue that you want to work on it. This is to prevent duplicated
efforts from contributors on the same issue.

## Pull Request Checklist

- Branch from the master branch and, if needed, rebase to the current master
  branch before submitting your pull request. If it doesn't merge cleanly with
  master you may be asked to rebase your changes.

- Commits should be as small as possible, while ensuring that each commit is
  correct independently (i.e., each commit should compile and pass tests).

- If your patch is not getting reviewed or you need a specific person to review
  it, you can @-reply a reviewer asking for a review in the pull request or a
  comment.

- Whenever applicable, add tests relevant to the fixed bug or new feature.

For specific git instructions, see [GitHub workflow 101](https://github.com/servo/servo/wiki/Github-workflow).

## Conduct

We follow the [Rust Code of Conduct](http://www.rust-lang.org/conduct.html).
For escalation or moderation issues, please contact Nical (nical@fastmail.com) instead of the Rust moderation team.

## License

Unless you explicitly state otherwise, any contribution intentionally submitted for inclusion in the work by you, as defined in the Apache-2.0 license, shall be licensed dual MIT/Apache 2, without any additional terms or conditions.
16
vendor/android_system_properties/Cargo.lock
generated
vendored
Normal file
@ -0,0 +1,16 @@
# This file is automatically @generated by Cargo.
# It is not intended for manual editing.
version = 3

[[package]]
name = "android_system_properties"
version = "0.1.5"
dependencies = [
 "libc",
]

[[package]]
name = "libc"
version = "0.2.126"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "349d5a591cd28b49e1d1037471617a32ddcda5731b99419008085f72d5a53836"
36
vendor/android_system_properties/Cargo.toml
vendored
Normal file
@ -0,0 +1,36 @@
# THIS FILE IS AUTOMATICALLY GENERATED BY CARGO
#
# When uploading crates to the registry Cargo will automatically
# "normalize" Cargo.toml files for maximal compatibility
# with all versions of Cargo and also rewrite `path` dependencies
# to registry (e.g., crates.io) dependencies.
#
# If you are reading this file be aware that the original Cargo.toml
# will likely look very different (and much more reasonable).
# See Cargo.toml.orig for the original contents.

[package]
edition = "2018"
name = "android_system_properties"
version = "0.1.5"
authors = ["Nicolas Silva <nical@fastmail.com>"]
description = "Minimal Android system properties wrapper"
homepage = "https://github.com/nical/android_system_properties"
documentation = "https://docs.rs/android_system_properties"
readme = "README.md"
keywords = ["android"]
license = "MIT/Apache-2.0"
repository = "https://github.com/nical/android_system_properties"

[package.metadata.docs.rs]
targets = [
    "arm-linux-androideabi",
    "armv7-linux-androideabi",
    "aarch64-linux-android",
    "i686-linux-android",
    "x86_64-linux-android",
    "x86_64-unknown-linux-gnu",
]

[dependencies.libc]
version = "0.2.126"
13
vendor/android_system_properties/LICENSE-APACHE
vendored
Normal file
@ -0,0 +1,13 @@
Copyright 2016 Nicolas Silva

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

    http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
20
vendor/android_system_properties/LICENSE-MIT
vendored
Normal file
@ -0,0 +1,20 @@
The MIT License (MIT)

Copyright (c) 2013 Nicolas Silva

Permission is hereby granted, free of charge, to any person obtaining a copy of
this software and associated documentation files (the "Software"), to deal in
the Software without restriction, including without limitation the rights to
use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of
the Software, and to permit persons to whom the Software is furnished to do so,
subject to the following conditions:

The above copyright notice and this permission notice shall be included in all
copies or substantial portions of the Software.

THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS
FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR
COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER
IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
36
vendor/android_system_properties/README.md
vendored
Normal file
@ -0,0 +1,36 @@
# android_system_properties

A thin rust wrapper for Android system properties.

This crate is similar to the `android-properties` crate with the exception that
the necessary Android libc symbols are loaded dynamically instead of linked
statically. In practice this means that the same binary will work with old and
new versions of Android, even though the API for reading system properties changed
around Android L.

## Example

```rust
use android_system_properties::AndroidSystemProperties;

let properties = AndroidSystemProperties::new();

if let Some(value) = properties.get("persist.sys.timezone") {
    println!("{}", value);
}
```

## Listing and setting properties

For the sake of simplicity this crate currently only contains what's needed by wgpu.
The implementations for listing and setting properties can be added back if anyone needs
them (let me know by filing an issue).

## License

Licensed under either of

* Apache License, Version 2.0 ([LICENSE-APACHE](LICENSE-APACHE) or http://www.apache.org/licenses/LICENSE-2.0)
* MIT license ([LICENSE-MIT](LICENSE-MIT) or http://opensource.org/licenses/MIT)

at your option.
9
vendor/android_system_properties/examples/time_zone.rs
vendored
Normal file
@ -0,0 +1,9 @@
/// Prints the current time zone, e.g. "Europe/Paris".

use android_system_properties::AndroidSystemProperties;

fn main() {
    let android_system_properties = AndroidSystemProperties::new();
    let tz = android_system_properties.get("persist.sys.timezone");
    println!("Your time zone is: {}", tz.as_deref().unwrap_or("<unknown>"));
}
221
vendor/android_system_properties/src/lib.rs
vendored
Normal file
@ -0,0 +1,221 @@
//! A thin rust wrapper for Android system properties.
//!
//! This crate is similar to the `android-properties` crate with the exception that
//! the necessary Android libc symbols are loaded dynamically instead of linked
//! statically. In practice this means that the same binary will work with old and
//! new versions of Android, even though the API for reading system properties changed
//! around Android L.
//!
//! ## Example
//!
//! ```rust
//! use android_system_properties::AndroidSystemProperties;
//!
//! let properties = AndroidSystemProperties::new();
//!
//! if let Some(value) = properties.get("persist.sys.timezone") {
//!     println!("{}", value);
//! }
//! ```
//!
//! ## Listing and setting properties
//!
//! For the sake of simplicity this crate currently only contains what's needed by wgpu.
//! The implementations for listing and setting properties can be added back if anyone needs
//! them (let me know by filing an issue).
//!
//! ## License
//!
//! Licensed under either of
//!
//! * Apache License, Version 2.0 ([LICENSE-APACHE] or <http://www.apache.org/licenses/LICENSE-2.0>)
//! * MIT license ([LICENSE-MIT] or <http://opensource.org/licenses/MIT>)
//!
//! at your option.
//!
//! [LICENSE-APACHE]: https://github.com/nical/android_system_properties/blob/804681c5c1c93d4fab29c1a2f47b7d808dc70fd3/LICENSE-APACHE
//! [LICENSE-MIT]: https://github.com/nical/android_system_properties/blob/804681c5c1c93d4fab29c1a2f47b7d808dc70fd3/LICENSE-MIT

use std::{
    ffi::{CStr, CString},
    os::raw::{c_char, c_int, c_void},
};

#[cfg(target_os = "android")]
use std::mem;

unsafe fn property_callback(payload: *mut String, _name: *const c_char, value: *const c_char, _serial: u32) {
    let cvalue = CStr::from_ptr(value);
    (*payload) = cvalue.to_str().unwrap().to_string();
}

type Callback = unsafe fn(*mut String, *const c_char, *const c_char, u32);

type SystemPropertyGetFn = unsafe extern "C" fn(*const c_char, *mut c_char) -> c_int;
type SystemPropertyFindFn = unsafe extern "C" fn(*const c_char) -> *const c_void;
type SystemPropertyReadCallbackFn = unsafe extern "C" fn(*const c_void, Callback, *mut String) -> *const c_void;

#[derive(Debug)]
/// An object that can retrieve android system properties.
///
/// ## Example
///
/// ```
/// use android_system_properties::AndroidSystemProperties;
///
/// let properties = AndroidSystemProperties::new();
///
/// if let Some(value) = properties.get("persist.sys.timezone") {
///     println!("{}", value);
/// }
/// ```
pub struct AndroidSystemProperties {
    libc_so: *mut c_void,
    get_fn: Option<SystemPropertyGetFn>,
    find_fn: Option<SystemPropertyFindFn>,
    read_callback_fn: Option<SystemPropertyReadCallbackFn>,
}

unsafe impl Send for AndroidSystemProperties {}
unsafe impl Sync for AndroidSystemProperties {}

impl AndroidSystemProperties {
    #[cfg(not(target_os = "android"))]
    /// Create an entry point for accessing Android properties.
    pub fn new() -> Self {
        AndroidSystemProperties {
            libc_so: std::ptr::null_mut(),
            find_fn: None,
            read_callback_fn: None,
            get_fn: None,
        }
    }

    #[cfg(target_os = "android")]
    /// Create an entry point for accessing Android properties.
    pub fn new() -> Self {
        let libc_so = unsafe { libc::dlopen(b"libc.so\0".as_ptr().cast(), libc::RTLD_NOLOAD) };

        let mut properties = AndroidSystemProperties {
            libc_so,
            find_fn: None,
            read_callback_fn: None,
            get_fn: None,
        };

        if libc_so.is_null() {
            return properties;
        }

        unsafe fn load_fn(libc_so: *mut c_void, name: &[u8]) -> Option<*const c_void> {
            let fn_ptr = libc::dlsym(libc_so, name.as_ptr().cast());

            if fn_ptr.is_null() {
                return None;
            }

            Some(fn_ptr)
        }

        unsafe {
            properties.read_callback_fn = load_fn(libc_so, b"__system_property_read_callback\0")
                .map(|raw| mem::transmute::<*const c_void, SystemPropertyReadCallbackFn>(raw));

            properties.find_fn = load_fn(libc_so, b"__system_property_find\0")
                .map(|raw| mem::transmute::<*const c_void, SystemPropertyFindFn>(raw));

            // Fallback for old versions of Android.
            if properties.read_callback_fn.is_none() || properties.find_fn.is_none() {
                properties.get_fn = load_fn(libc_so, b"__system_property_get\0")
                    .map(|raw| mem::transmute::<*const c_void, SystemPropertyGetFn>(raw));
            }
        }

        properties
    }

    /// Retrieve a system property.
    ///
    /// Returns None if the operation fails.
    ///
    /// # Example
    ///
    /// ```
    /// # use android_system_properties::AndroidSystemProperties;
    /// let properties = AndroidSystemProperties::new();
    ///
    /// if let Some(value) = properties.get("persist.sys.timezone") {
    ///     println!("{}", value);
    /// }
    /// ```
    pub fn get(&self, name: &str) -> Option<String> {
        let cname = CString::new(name).ok()?;
        self.get_from_cstr(&cname)
    }

    /// Retrieve a system property using a [`CStr`] key.
    ///
    /// Returns None if the operation fails.
    ///
    /// # Example
    ///
    /// ```
    /// # use android_system_properties::AndroidSystemProperties;
    /// # use std::ffi::CStr;
    /// let properties = AndroidSystemProperties::new();
    ///
    /// let key = unsafe { CStr::from_bytes_with_nul_unchecked(b"persist.sys.timezone\0") };
    /// if let Some(value) = properties.get_from_cstr(key) {
    ///     println!("{}", value);
    /// }
    /// ```
    pub fn get_from_cstr(&self, cname: &std::ffi::CStr) -> Option<String> {
        // If available, use the recommended approach to accessing properties (Android L and onward).
        if let (Some(find_fn), Some(read_callback_fn)) = (self.find_fn, self.read_callback_fn) {
            let info = unsafe { (find_fn)(cname.as_ptr()) };

            if info.is_null() {
                return None;
            }

            let mut result = String::new();

            unsafe {
                (read_callback_fn)(info, property_callback, &mut result);
            }

            return Some(result);
        }

        // Fall back to the older approach.
        if let Some(get_fn) = self.get_fn {
            // The constant is PROP_VALUE_MAX in Android's libc/include/sys/system_properties.h
            const PROPERTY_VALUE_MAX: usize = 92;
            let mut buffer: Vec<u8> = Vec::with_capacity(PROPERTY_VALUE_MAX);
            let raw = buffer.as_mut_ptr() as *mut c_char;

            let len = unsafe { (get_fn)(cname.as_ptr(), raw) };

            if len > 0 {
                assert!(len as usize <= buffer.capacity());
                unsafe { buffer.set_len(len as usize); }
                String::from_utf8(buffer).ok()
            } else {
                None
            }
        } else {
            None
        }
    }
}

impl Drop for AndroidSystemProperties {
    fn drop(&mut self) {
        if !self.libc_so.is_null() {
            unsafe {
                libc::dlclose(self.libc_so);
            }
        }
    }
}
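A practical note on the `get`/`get_from_cstr` split above: `get` allocates a fresh `CString` on every call, so callers that poll the same property repeatedly can build the NUL-terminated key once. A minimal sketch (using the checked `CStr::from_bytes_with_nul` instead of the unchecked constructor shown in the doc example):

```rust
use std::ffi::CStr;

use android_system_properties::AndroidSystemProperties;

fn main() {
    let properties = AndroidSystemProperties::new();
    // Build the key once instead of re-allocating it per `get` call.
    let key = CStr::from_bytes_with_nul(b"persist.sys.timezone\0").unwrap();
    for _ in 0..3 {
        // Returns None off-Android, where no libc symbols were loaded.
        if let Some(value) = properties.get_from_cstr(key) {
            println!("{}", value);
        }
    }
}
```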
1
vendor/anstream/.cargo-checksum.json
vendored
Normal file
@ -0,0 +1 @@
{"files":{"Cargo.lock":"23d8ed34328e75b81cc00af942f61b1bc05434435e92fca8a2d649abfc1b8f49","Cargo.toml":"ceca3cf1fc87f6ec6c2c687410fc2e60ed31bf58c39c54e8237603abc423b246","LICENSE-APACHE":"c6596eb7be8581c18be736c846fb9173b69eccf6ef94c5135893ec56bd92ba08","LICENSE-MIT":"6efb0476a1cc085077ed49357026d8c173bf33017278ef440f222fb9cbcb66e6","README.md":"b230c2257d0c7a49b9bd97f2fa73abedcdc055757b5cedd2b0eb1a7a448ff461","benches/stream.rs":"7e666c4f4b79ddb5237361ed25264a966ee241192fbb2c1baea3006e3e0326b4","benches/strip.rs":"9603bd5ca1ae4661c2ccab50315dbfdec0c661ac2624262172bbd8f5d0bd87c9","benches/wincon.rs":"680e86933c008b242a3286c5149c33d3c086426eb99fe134b6e79f7578f96663","examples/dump-stream.rs":"9c5791bd739c3a74cfc24da90a5f96ee448b71ecf9800d3934028c5d3deb28e6","examples/query-stream.rs":"16f38843083174fbefa974a5aa38a5f3ffa51bd6e6db3dc1d91164462219399e","src/adapter/mod.rs":"baf4237ea0b18df63609e49d93572ca27c2202a4cbec0220adb5a7e815c7d8ed","src/adapter/strip.rs":"b324562426cb7ad8bceeeb8ea012746b5a046f901ea878d6de8d61f96ec96a55","src/adapter/wincon.rs":"96ce7d753abb4d6ed42c044545a4f557455bb825432904d6316c0aa245eb0085","src/auto.rs":"aa7f0988fc1c3f8c0d5bf1ff12e108cc3eb29d330f28da02cb4a2e09ec9fcc7c","src/buffer.rs":"83e7088b50dd3e2941c06a417d9eef75fda45311a2912ba94f480ec98d6f0183","src/fmt.rs":"cc11b005c4559843bd908a57958a13c8d0922fae6aff5261f3583c90e60da73c","src/lib.rs":"7502dcd2be531d787eac7efde761c95c892a425b55a53d6d99ef5d99d912a5f0","src/macros.rs":"a26ababe32a39732d0aade9674f6e5e267bd26c6ea06603ff9e61e80681195e0","src/stream.rs":"cbe8f61fba4c3c60934339c8bda5d1ff43320f57cdc4ed409aa173945a941b3d","src/strip.rs":"09c8bcd5bda0b07b56929026d965222d8129908f8386350b87314bc5fefcc2fe","src/wincon.rs":"e85c03ccfeca352a32572db8bb6c903f78c2003f5b375254edc5a69d6843728f"},"package":"d96bd03f33fe50a863e394ee9718a706f988b9079b20c3784fb726e7678b62fb"}
1094
vendor/anstream/Cargo.lock
generated
vendored
Normal file
File diff suppressed because it is too large
140
vendor/anstream/Cargo.toml
vendored
Normal file
@ -0,0 +1,140 @@
# THIS FILE IS AUTOMATICALLY GENERATED BY CARGO
#
# When uploading crates to the registry Cargo will automatically
# "normalize" Cargo.toml files for maximal compatibility
# with all versions of Cargo and also rewrite `path` dependencies
# to registry (e.g., crates.io) dependencies.
#
# If you are reading this file be aware that the original Cargo.toml
# will likely look very different (and much more reasonable).
# See Cargo.toml.orig for the original contents.

[package]
edition = "2021"
rust-version = "1.70.0"
name = "anstream"
version = "0.6.13"
include = [
    "build.rs",
    "src/**/*",
    "Cargo.toml",
    "Cargo.lock",
    "LICENSE*",
    "README.md",
    "benches/**/*",
    "examples/**/*",
]
description = "A simple cross platform library for writing colored text to a terminal."
homepage = "https://github.com/rust-cli/anstyle"
readme = "README.md"
keywords = [
    "ansi",
    "terminal",
    "color",
    "strip",
    "wincon",
]
categories = ["command-line-interface"]
license = "MIT OR Apache-2.0"
repository = "https://github.com/rust-cli/anstyle.git"

[package.metadata.docs.rs]
cargo-args = [
    "-Zunstable-options",
    "-Zrustdoc-scrape-examples",
]
rustdoc-args = [
    "--cfg",
    "docsrs",
]

[[package.metadata.release.pre-release-replacements]]
file = "CHANGELOG.md"
min = 1
replace = "{{version}}"
search = "Unreleased"

[[package.metadata.release.pre-release-replacements]]
exactly = 1
file = "CHANGELOG.md"
replace = "...{{tag_name}}"
search = '\.\.\.HEAD'

[[package.metadata.release.pre-release-replacements]]
file = "CHANGELOG.md"
min = 1
replace = "{{date}}"
search = "ReleaseDate"

[[package.metadata.release.pre-release-replacements]]
exactly = 1
file = "CHANGELOG.md"
replace = """
<!-- next-header -->
## [Unreleased] - ReleaseDate
"""
search = "<!-- next-header -->"

[[package.metadata.release.pre-release-replacements]]
exactly = 1
file = "CHANGELOG.md"
replace = """
<!-- next-url -->
[Unreleased]: https://github.com/rust-cli/anstyle/compare/{{tag_name}}...HEAD"""
search = "<!-- next-url -->"

[[bench]]
name = "strip"
harness = false

[[bench]]
name = "wincon"
harness = false

[[bench]]
name = "stream"
harness = false

[dependencies.anstyle]
version = "1.0.0"

[dependencies.anstyle-parse]
version = "0.2.0"

[dependencies.anstyle-query]
version = "1.0.0"
optional = true

[dependencies.colorchoice]
version = "1.0.0"

[dependencies.utf8parse]
version = "0.2.1"

[dev-dependencies.criterion]
version = "0.5.1"

[dev-dependencies.lexopt]
version = "0.3.0"

[dev-dependencies.owo-colors]
version = "4.0.0"

[dev-dependencies.proptest]
version = "1.4.0"

[dev-dependencies.strip-ansi-escapes]
version = "0.2.0"

[features]
auto = ["dep:anstyle-query"]
default = [
    "auto",
    "wincon",
]
test = []
wincon = ["dep:anstyle-wincon"]

[target."cfg(windows)".dependencies.anstyle-wincon]
version = "3.0.1"
optional = true
202
vendor/anstream/LICENSE-APACHE
vendored
Normal file
@ -0,0 +1,202 @@
Apache License
Version 2.0, January 2004
http://www.apache.org/licenses/

TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION

1. Definitions.

"License" shall mean the terms and conditions for use, reproduction,
and distribution as defined by Sections 1 through 9 of this document.

"Licensor" shall mean the copyright owner or entity authorized by
the copyright owner that is granting the License.

"Legal Entity" shall mean the union of the acting entity and all
other entities that control, are controlled by, or are under common
control with that entity. For the purposes of this definition,
"control" means (i) the power, direct or indirect, to cause the
direction or management of such entity, whether by contract or
otherwise, or (ii) ownership of fifty percent (50%) or more of the
outstanding shares, or (iii) beneficial ownership of such entity.

"You" (or "Your") shall mean an individual or Legal Entity
exercising permissions granted by this License.

"Source" form shall mean the preferred form for making modifications,
including but not limited to software source code, documentation
source, and configuration files.

"Object" form shall mean any form resulting from mechanical
transformation or translation of a Source form, including but
not limited to compiled object code, generated documentation,
and conversions to other media types.

"Work" shall mean the work of authorship, whether in Source or
Object form, made available under the License, as indicated by a
copyright notice that is included in or attached to the work
(an example is provided in the Appendix below).

"Derivative Works" shall mean any work, whether in Source or Object
form, that is based on (or derived from) the Work and for which the
editorial revisions, annotations, elaborations, or other modifications
represent, as a whole, an original work of authorship. For the purposes
of this License, Derivative Works shall not include works that remain
separable from, or merely link (or bind by name) to the interfaces of,
the Work and Derivative Works thereof.

"Contribution" shall mean any work of authorship, including
the original version of the Work and any modifications or additions
to that Work or Derivative Works thereof, that is intentionally
submitted to Licensor for inclusion in the Work by the copyright owner
or by an individual or Legal Entity authorized to submit on behalf of
the copyright owner. For the purposes of this definition, "submitted"
means any form of electronic, verbal, or written communication sent
to the Licensor or its representatives, including but not limited to
communication on electronic mailing lists, source code control systems,
and issue tracking systems that are managed by, or on behalf of, the
Licensor for the purpose of discussing and improving the Work, but
excluding communication that is conspicuously marked or otherwise
designated in writing by the copyright owner as "Not a Contribution."

"Contributor" shall mean Licensor and any individual or Legal Entity
on behalf of whom a Contribution has been received by Licensor and
subsequently incorporated within the Work.

2. Grant of Copyright License. Subject to the terms and conditions of
this License, each Contributor hereby grants to You a perpetual,
worldwide, non-exclusive, no-charge, royalty-free, irrevocable
copyright license to reproduce, prepare Derivative Works of,
publicly display, publicly perform, sublicense, and distribute the
Work and such Derivative Works in Source or Object form.

3. Grant of Patent License. Subject to the terms and conditions of
this License, each Contributor hereby grants to You a perpetual,
worldwide, non-exclusive, no-charge, royalty-free, irrevocable
(except as stated in this section) patent license to make, have made,
use, offer to sell, sell, import, and otherwise transfer the Work,
where such license applies only to those patent claims licensable
by such Contributor that are necessarily infringed by their
Contribution(s) alone or by combination of their Contribution(s)
with the Work to which such Contribution(s) was submitted. If You
institute patent litigation against any entity (including a
cross-claim or counterclaim in a lawsuit) alleging that the Work
or a Contribution incorporated within the Work constitutes direct
or contributory patent infringement, then any patent licenses
granted to You under this License for that Work shall terminate
as of the date such litigation is filed.

4. Redistribution. You may reproduce and distribute copies of the
Work or Derivative Works thereof in any medium, with or without
modifications, and in Source or Object form, provided that You
meet the following conditions:

(a) You must give any other recipients of the Work or
Derivative Works a copy of this License; and

(b) You must cause any modified files to carry prominent notices
stating that You changed the files; and

(c) You must retain, in the Source form of any Derivative Works
that You distribute, all copyright, patent, trademark, and
attribution notices from the Source form of the Work,
excluding those notices that do not pertain to any part of
the Derivative Works; and

(d) If the Work includes a "NOTICE" text file as part of its
distribution, then any Derivative Works that You distribute must
include a readable copy of the attribution notices contained
within such NOTICE file, excluding those notices that do not
pertain to any part of the Derivative Works, in at least one
of the following places: within a NOTICE text file distributed
as part of the Derivative Works; within the Source form or
documentation, if provided along with the Derivative Works; or,
within a display generated by the Derivative Works, if and
wherever such third-party notices normally appear. The contents
of the NOTICE file are for informational purposes only and
do not modify the License. You may add Your own attribution
notices within Derivative Works that You distribute, alongside
or as an addendum to the NOTICE text from the Work, provided
that such additional attribution notices cannot be construed
as modifying the License.

You may add Your own copyright statement to Your modifications and
may provide additional or different license terms and conditions
for use, reproduction, or distribution of Your modifications, or
for any such Derivative Works as a whole, provided Your use,
reproduction, and distribution of the Work otherwise complies with
the conditions stated in this License.

5. Submission of Contributions. Unless You explicitly state otherwise,
any Contribution intentionally submitted for inclusion in the Work
by You to the Licensor shall be under the terms and conditions of
this License, without any additional terms or conditions.
Notwithstanding the above, nothing herein shall supersede or modify
the terms of any separate license agreement you may have executed
with Licensor regarding such Contributions.

6. Trademarks. This License does not grant permission to use the trade
names, trademarks, service marks, or product names of the Licensor,
except as required for reasonable and customary use in describing the
origin of the Work and reproducing the content of the NOTICE file.

7. Disclaimer of Warranty. Unless required by applicable law or
agreed to in writing, Licensor provides the Work (and each
Contributor provides its Contributions) on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
implied, including, without limitation, any warranties or conditions
of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A
PARTICULAR PURPOSE. You are solely responsible for determining the
appropriateness of using or redistributing the Work and assume any
risks associated with Your exercise of permissions under this License.

8. Limitation of Liability. In no event and under no legal theory,
whether in tort (including negligence), contract, or otherwise,
unless required by applicable law (such as deliberate and grossly
negligent acts) or agreed to in writing, shall any Contributor be
liable to You for damages, including any direct, indirect, special,
incidental, or consequential damages of any character arising as a
result of this License or out of the use or inability to use the
Work (including but not limited to damages for loss of goodwill,
work stoppage, computer failure or malfunction, or any and all
other commercial damages or losses), even if such Contributor
has been advised of the possibility of such damages.

9. Accepting Warranty or Additional Liability. While redistributing
the Work or Derivative Works thereof, You may choose to offer,
and charge a fee for, acceptance of support, warranty, indemnity,
or other liability obligations and/or rights consistent with this
License. However, in accepting such obligations, You may act only
on Your own behalf and on Your sole responsibility, not on behalf
of any other Contributor, and only if You agree to indemnify,
defend, and hold each Contributor harmless for any liability
incurred by, or claims asserted against, such Contributor by reason
of your accepting any such warranty or additional liability.

END OF TERMS AND CONDITIONS

APPENDIX: How to apply the Apache License to your work.

To apply the Apache License to your work, attach the following
boilerplate notice, with the fields enclosed by brackets "{}"
replaced with your own identifying information. (Don't include
the brackets!) The text should be enclosed in the appropriate
comment syntax for the file format. We also recommend that a
file or class name and description of purpose be included on the
same "printed page" as the copyright notice for easier
identification within third-party archives.

Copyright {yyyy} {name of copyright owner}

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

    http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
19
vendor/anstream/LICENSE-MIT
vendored
Normal file
@ -0,0 +1,19 @@
Copyright (c) Individual contributors

Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:

The above copyright notice and this permission notice shall be included in all
copies or substantial portions of the Software.

THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
SOFTWARE.
34
vendor/anstream/README.md
vendored
Normal file
@ -0,0 +1,34 @@
# anstream

> A simple cross platform library for writing colored text to a terminal.

*A portmanteau of "ansi stream"*

[][Documentation]
[](https://crates.io/crates/anstream)

Specialized `stdout` and `stderr` that accept ANSI escape codes and adapt them
based on the terminal's capabilities.

`anstream::adapter::strip_str` may also be of interest on its own for low
overhead stripping of ANSI escape codes.

## License

Licensed under either of

* Apache License, Version 2.0, ([LICENSE-APACHE](LICENSE-APACHE) or http://www.apache.org/licenses/LICENSE-2.0)
* MIT license ([LICENSE-MIT](LICENSE-MIT) or http://opensource.org/licenses/MIT)

at your option.

### Contribution

Unless you explicitly state otherwise, any contribution intentionally
submitted for inclusion in the work by you, as defined in the Apache-2.0
license, shall be dual licensed as above, without any additional terms or
conditions.

[Crates.io]: https://crates.io/crates/anstream
[Documentation]: https://docs.rs/anstream
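To make the "adapt to the terminal" claim concrete, here is a minimal sketch using only items that appear elsewhere in this vendor tree (`anstream::stdout` in `examples/dump-stream.rs`, `adapter::strip_str` in the benches); the escape sequences themselves are illustrative:

```rust
use std::io::Write as _;

fn main() -> std::io::Result<()> {
    // On a capable terminal the codes pass through; otherwise they are
    // stripped (or converted for legacy Windows consoles).
    let mut out = anstream::stdout();
    writeln!(out, "\x1b[1mbold where supported, plain elsewhere\x1b[0m")?;

    // The stripping adapter is also usable on its own, without a stream:
    let plain = anstream::adapter::strip_str("\x1b[31mred\x1b[0m").to_string();
    assert_eq!(plain, "red");
    Ok(())
}
```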
81
vendor/anstream/benches/stream.rs
vendored
Normal file
@ -0,0 +1,81 @@
use std::io::Write as _;

use criterion::{black_box, Criterion};

fn stream(c: &mut Criterion) {
    for (name, content) in [
        ("demo.vte", &include_bytes!("../tests/demo.vte")[..]),
        ("rg_help.vte", &include_bytes!("../tests/rg_help.vte")[..]),
        ("rg_linus.vte", &include_bytes!("../tests/rg_linus.vte")[..]),
        (
            "state_changes",
            &b"\x1b]2;X\x1b\\ \x1b[0m \x1bP0@\x1b\\"[..],
        ),
    ] {
        let mut group = c.benchmark_group(name);
        group.bench_function("nop", |b| {
            b.iter(|| {
                let buffer = Vec::with_capacity(content.len());
                let mut stream = buffer;

                stream.write_all(content).unwrap();

                black_box(stream)
            })
        });
        group.bench_function("StripStream", |b| {
            b.iter(|| {
                let buffer = Vec::with_capacity(content.len());
                let mut stream = anstream::StripStream::new(buffer);

                stream.write_all(content).unwrap();

                black_box(stream)
            })
        });
        #[cfg(all(windows, feature = "wincon"))]
        group.bench_function("WinconStream", |b| {
            b.iter(|| {
                let buffer = Vec::with_capacity(content.len());
                let mut stream = anstream::WinconStream::new(buffer);

                stream.write_all(content).unwrap();

                black_box(stream)
            })
        });
        group.bench_function("AutoStream::always_ansi", |b| {
            b.iter(|| {
                let buffer = Vec::with_capacity(content.len());
                let mut stream = anstream::AutoStream::always_ansi(buffer);

                stream.write_all(content).unwrap();

                black_box(stream)
            })
        });
        group.bench_function("AutoStream::always", |b| {
            b.iter(|| {
                let buffer = Vec::with_capacity(content.len());
                let mut stream = anstream::AutoStream::always(buffer);

                stream.write_all(content).unwrap();

                black_box(stream)
            })
        });
        group.bench_function("AutoStream::never", |b| {
            b.iter(|| {
                let buffer = Vec::with_capacity(content.len());
                let mut stream = anstream::AutoStream::never(buffer);

                stream.write_all(content).unwrap();

                black_box(stream)
            })
        });
    }
}

criterion::criterion_group!(benches, stream);
criterion::criterion_main!(benches);
102
vendor/anstream/benches/strip.rs
vendored
Normal file
@ -0,0 +1,102 @@
use criterion::{black_box, Criterion};

#[derive(Default)]
struct Strip(String);
impl Strip {
    fn with_capacity(capacity: usize) -> Self {
        Self(String::with_capacity(capacity))
    }
}
impl anstyle_parse::Perform for Strip {
    fn print(&mut self, c: char) {
        self.0.push(c);
    }

    fn execute(&mut self, byte: u8) {
        if byte.is_ascii_whitespace() {
            self.0.push(byte as char);
        }
    }
}

fn strip(c: &mut Criterion) {
    for (name, content) in [
        ("demo.vte", &include_bytes!("../tests/demo.vte")[..]),
        ("rg_help.vte", &include_bytes!("../tests/rg_help.vte")[..]),
        ("rg_linus.vte", &include_bytes!("../tests/rg_linus.vte")[..]),
        (
            "state_changes",
            &b"\x1b]2;X\x1b\\ \x1b[0m \x1bP0@\x1b\\"[..],
        ),
    ] {
        // Make sure the comparison is fair
        if let Ok(content) = std::str::from_utf8(content) {
            let mut stripped = Strip::with_capacity(content.len());
            let mut parser = anstyle_parse::Parser::<anstyle_parse::DefaultCharAccumulator>::new();
            for byte in content.as_bytes() {
                parser.advance(&mut stripped, *byte);
            }
            assert_eq!(
                stripped.0,
                anstream::adapter::strip_str(content).to_string()
            );
            assert_eq!(
                stripped.0,
                String::from_utf8(anstream::adapter::strip_bytes(content.as_bytes()).into_vec())
                    .unwrap()
            );
        }

        let mut group = c.benchmark_group(name);
        group.bench_function("advance_strip", |b| {
            b.iter(|| {
                let mut stripped = Strip::with_capacity(content.len());
                let mut parser =
                    anstyle_parse::Parser::<anstyle_parse::DefaultCharAccumulator>::new();

                for byte in content {
                    parser.advance(&mut stripped, *byte);
                }

                black_box(stripped.0)
            })
        });
        group.bench_function("strip_ansi_escapes", |b| {
            b.iter(|| {
                let stripped = strip_ansi_escapes::strip(content);

                black_box(stripped)
            })
        });
        if let Ok(content) = std::str::from_utf8(content) {
            group.bench_function("strip_str", |b| {
                b.iter(|| {
                    let stripped = anstream::adapter::strip_str(content).to_string();

                    black_box(stripped)
                })
            });
            group.bench_function("StripStr", |b| {
                b.iter(|| {
                    let mut stripped = String::with_capacity(content.len());
                    let mut state = anstream::adapter::StripStr::new();
                    for printable in state.strip_next(content) {
                        stripped.push_str(printable);
                    }

                    black_box(stripped)
                })
            });
        }
        group.bench_function("strip_bytes", |b| {
            b.iter(|| {
                let stripped = anstream::adapter::strip_bytes(content).into_vec();

                black_box(stripped)
            })
        });
    }
}

criterion::criterion_group!(benches, strip);
criterion::criterion_main!(benches);
26
vendor/anstream/benches/wincon.rs
vendored
Normal file
@ -0,0 +1,26 @@
use criterion::{black_box, Criterion};

fn wincon(c: &mut Criterion) {
    for (name, content) in [
        ("demo.vte", &include_bytes!("../tests/demo.vte")[..]),
        ("rg_help.vte", &include_bytes!("../tests/rg_help.vte")[..]),
        ("rg_linus.vte", &include_bytes!("../tests/rg_linus.vte")[..]),
        (
            "state_changes",
            &b"\x1b]2;X\x1b\\ \x1b[0m \x1bP0@\x1b\\"[..],
        ),
    ] {
        let mut group = c.benchmark_group(name);
        group.bench_function("wincon_bytes", |b| {
            b.iter(|| {
                let mut state = anstream::adapter::WinconBytes::new();
                let stripped = state.extract_next(content).collect::<Vec<_>>();

                black_box(stripped)
            })
        });
    }
}

criterion::criterion_group!(benches, wincon);
criterion::criterion_main!(benches);
126
vendor/anstream/examples/dump-stream.rs
vendored
Normal file
@ -0,0 +1,126 @@
use std::io::Write;

fn main() -> Result<(), lexopt::Error> {
    let args = Args::parse()?;
    let stdout = anstream::stdout();
    let mut stdout = stdout.lock();

    for fixed in 0..16 {
        let color = anstyle::Ansi256Color(fixed).into_ansi().unwrap();
        let style = style(color, args.layer, args.effects);
        let _ = print_number(&mut stdout, fixed, style);
        if fixed == 7 || fixed == 15 {
            let _ = writeln!(&mut stdout);
        }
    }

    for fixed in 16..232 {
        let col = (fixed - 16) % 36;
        if col == 0 {
            let _ = writeln!(stdout);
        }
        let color = anstyle::Ansi256Color(fixed);
        let style = style(color, args.layer, args.effects);
        let _ = print_number(&mut stdout, fixed, style);
    }

    let _ = writeln!(stdout);
    let _ = writeln!(stdout);
    for fixed in 232..=255 {
        let color = anstyle::Ansi256Color(fixed);
        let style = style(color, args.layer, args.effects);
        let _ = print_number(&mut stdout, fixed, style);
    }

    let _ = writeln!(stdout);

    Ok(())
}

fn style(
    color: impl Into<anstyle::Color>,
    layer: Layer,
    effects: anstyle::Effects,
) -> anstyle::Style {
    let color = color.into();
    (match layer {
        Layer::Fg => anstyle::Style::new().fg_color(Some(color)),
        Layer::Bg => anstyle::Style::new().bg_color(Some(color)),
        Layer::Underline => anstyle::Style::new().underline_color(Some(color)),
    }) | effects
}

fn print_number(stdout: &mut impl Write, fixed: u8, style: anstyle::Style) -> std::io::Result<()> {
    write!(stdout, "{style}{fixed:>3X}{style:#}",)
}

#[derive(Default)]
struct Args {
    effects: anstyle::Effects,
    layer: Layer,
}

#[derive(Copy, Clone, Default)]
enum Layer {
    #[default]
    Fg,
    Bg,
    Underline,
}

impl Args {
    fn parse() -> Result<Self, lexopt::Error> {
        use lexopt::prelude::*;

        let mut res = Args::default();

        let mut args = lexopt::Parser::from_env();
        while let Some(arg) = args.next()? {
            match arg {
                Long("layer") => {
                    res.layer = args.value()?.parse_with(|s| match s {
                        "fg" => Ok(Layer::Fg),
                        "bg" => Ok(Layer::Bg),
                        "underline" => Ok(Layer::Underline),
                        _ => Err("expected values fg, bg, underline"),
                    })?;
                }
                Long("effect") => {
                    const EFFECTS: [(&str, anstyle::Effects); 12] = [
                        ("bold", anstyle::Effects::BOLD),
                        ("dimmed", anstyle::Effects::DIMMED),
                        ("italic", anstyle::Effects::ITALIC),
                        ("underline", anstyle::Effects::UNDERLINE),
                        ("double_underline", anstyle::Effects::DOUBLE_UNDERLINE),
                        ("curly_underline", anstyle::Effects::CURLY_UNDERLINE),
                        ("dotted_underline", anstyle::Effects::DOTTED_UNDERLINE),
                        ("dashed_underline", anstyle::Effects::DASHED_UNDERLINE),
                        ("blink", anstyle::Effects::BLINK),
                        ("invert", anstyle::Effects::INVERT),
                        ("hidden", anstyle::Effects::HIDDEN),
                        ("strikethrough", anstyle::Effects::STRIKETHROUGH),
                    ];
                    let effect = args.value()?.parse_with(|s| {
                        EFFECTS
                            .into_iter()
                            .find(|(name, _)| *name == s)
                            .map(|(_, effect)| effect)
                            .ok_or_else(|| {
                                format!(
                                    "expected one of {}",
                                    EFFECTS
                                        .into_iter()
                                        .map(|(n, _)| n)
                                        .collect::<Vec<_>>()
                                        .join(", ")
                                )
                            })
                    })?;
                    res.effects = res.effects.insert(effect);
                }
                _ => return Err(arg.unexpected()),
            }
        }
        Ok(res)
    }
}
20
vendor/anstream/examples/query-stream.rs
vendored
Normal file
@ -0,0 +1,20 @@
fn main() {
    println!("stdout:");
    println!(
        "  choice: {:?}",
        anstream::AutoStream::choice(&std::io::stdout())
    );
    println!(
        "  choice: {:?}",
        anstream::AutoStream::auto(std::io::stdout()).current_choice()
    );
    println!("stderr:");
    println!(
        "  choice: {:?}",
        anstream::AutoStream::choice(&std::io::stderr())
    );
    println!(
        "  choice: {:?}",
        anstream::AutoStream::auto(std::io::stderr()).current_choice()
    );
}
15
vendor/anstream/src/adapter/mod.rs
vendored
Normal file
@ -0,0 +1,15 @@
//! Gracefully degrade styled output

mod strip;
mod wincon;

pub use strip::strip_bytes;
pub use strip::strip_str;
pub use strip::StripBytes;
pub use strip::StripBytesIter;
pub use strip::StripStr;
pub use strip::StripStrIter;
pub use strip::StrippedBytes;
pub use strip::StrippedStr;
pub use wincon::WinconBytes;
pub use wincon::WinconBytesIter;
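The `adapter` module above pairs one-shot helpers (`strip_str`, `strip_bytes`) with incremental state machines (`StripStr`, `StripBytes`, `WinconBytes`). A minimal sketch of the one-shot calls, assuming only the re-exports listed above (not part of the vendored diff):

```rust
fn main() {
    // One-shot: drop ANSI escapes from a complete &str.
    let plain = anstream::adapter::strip_str("\x1b[32mok\x1b[0m").to_string();
    assert_eq!(plain, "ok");

    // Byte-oriented variant for input that may not be valid UTF-8.
    let bytes = anstream::adapter::strip_bytes(b"\x1b[31merr\x1b[0m").into_vec();
    assert_eq!(bytes, b"err");
}
```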
509
vendor/anstream/src/adapter/strip.rs
vendored
Normal file
@ -0,0 +1,509 @@
use anstyle_parse::state::state_change;
use anstyle_parse::state::Action;
use anstyle_parse::state::State;

/// Strip ANSI escapes from a `&str`, returning the printable content
///
/// This can be used to take output from a program that includes escape sequences and write it
/// somewhere that does not easily support them, such as a log file.
///
/// For non-contiguous data, see [`StripStr`].
///
/// # Example
///
/// ```rust
/// use std::io::Write as _;
///
/// let styled_text = "\x1b[32mfoo\x1b[m bar";
/// let plain_str = anstream::adapter::strip_str(&styled_text).to_string();
/// assert_eq!(plain_str, "foo bar");
/// ```
#[inline]
pub fn strip_str(data: &str) -> StrippedStr<'_> {
    StrippedStr::new(data)
}

/// See [`strip_str`]
#[derive(Default, Clone, Debug, PartialEq, Eq)]
pub struct StrippedStr<'s> {
    bytes: &'s [u8],
    state: State,
}

impl<'s> StrippedStr<'s> {
    #[inline]
    fn new(data: &'s str) -> Self {
        Self {
            bytes: data.as_bytes(),
            state: State::Ground,
        }
    }

    /// Create a [`String`] of the printable content
    #[inline]
    #[allow(clippy::inherent_to_string_shadow_display)] // Single-allocation implementation
    pub fn to_string(&self) -> String {
        use std::fmt::Write as _;
        let mut stripped = String::with_capacity(self.bytes.len());
        let _ = write!(&mut stripped, "{}", self);
        stripped
    }
}

impl<'s> std::fmt::Display for StrippedStr<'s> {
    /// **Note:** this does *not* exhaust the [`Iterator`]
    #[inline]
    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
        let iter = Self {
            bytes: self.bytes,
            state: self.state,
        };
        for printable in iter {
            printable.fmt(f)?;
        }
        Ok(())
    }
}

impl<'s> Iterator for StrippedStr<'s> {
    type Item = &'s str;

    #[inline]
    fn next(&mut self) -> Option<Self::Item> {
        next_str(&mut self.bytes, &mut self.state)
    }
}

/// Incrementally strip non-contiguous data
#[derive(Default, Clone, Debug, PartialEq, Eq)]
pub struct StripStr {
    state: State,
}

impl StripStr {
    /// Initial state
    pub fn new() -> Self {
        Default::default()
    }

    /// Strip the next segment of data
    pub fn strip_next<'s>(&'s mut self, data: &'s str) -> StripStrIter<'s> {
        StripStrIter {
            bytes: data.as_bytes(),
            state: &mut self.state,
        }
    }
}

/// See [`StripStr`]
#[derive(Debug, PartialEq, Eq)]
pub struct StripStrIter<'s> {
    bytes: &'s [u8],
    state: &'s mut State,
}

impl<'s> Iterator for StripStrIter<'s> {
    type Item = &'s str;

    #[inline]
    fn next(&mut self) -> Option<Self::Item> {
        next_str(&mut self.bytes, self.state)
    }
}

#[inline]
fn next_str<'s>(bytes: &mut &'s [u8], state: &mut State) -> Option<&'s str> {
    let offset = bytes.iter().copied().position(|b| {
        let (next_state, action) = state_change(*state, b);
        if next_state != State::Anywhere {
            *state = next_state;
        }
        is_printable_bytes(action, b)
    });
    let (_, next) = bytes.split_at(offset.unwrap_or(bytes.len()));
    *bytes = next;
    *state = State::Ground;

    let offset = bytes.iter().copied().position(|b| {
        let (_next_state, action) = state_change(State::Ground, b);
        !(is_printable_bytes(action, b) || is_utf8_continuation(b))
    });
    let (printable, next) = bytes.split_at(offset.unwrap_or(bytes.len()));
    *bytes = next;
    if printable.is_empty() {
        None
    } else {
        let printable = unsafe {
            from_utf8_unchecked(
                printable,
                "`bytes` was validated as UTF-8, the parser preserves UTF-8 continuations",
            )
        };
        Some(printable)
    }
}

#[inline]
unsafe fn from_utf8_unchecked<'b>(bytes: &'b [u8], safety_justification: &'static str) -> &'b str {
    if cfg!(debug_assertions) {
        // Catch problems more quickly when testing
        std::str::from_utf8(bytes).expect(safety_justification)
    } else {
        std::str::from_utf8_unchecked(bytes)
    }
}

#[inline]
fn is_utf8_continuation(b: u8) -> bool {
    matches!(b, 0x80..=0xbf)
}

/// Strip ANSI escapes from bytes, returning the printable content
///
/// This can be used to take output from a program that includes escape sequences and write it
/// somewhere that does not easily support them, such as a log file.
///
/// # Example
///
/// ```rust
/// use std::io::Write as _;
///
/// let styled_text = "\x1b[32mfoo\x1b[m bar";
/// let plain_str = anstream::adapter::strip_bytes(styled_text.as_bytes()).into_vec();
/// assert_eq!(plain_str.as_slice(), &b"foo bar"[..]);
/// ```
#[inline]
pub fn strip_bytes(data: &[u8]) -> StrippedBytes<'_> {
    StrippedBytes::new(data)
}

/// See [`strip_bytes`]
#[derive(Default, Clone, Debug, PartialEq, Eq)]
pub struct StrippedBytes<'s> {
    bytes: &'s [u8],
    state: State,
    utf8parser: Utf8Parser,
}

impl<'s> StrippedBytes<'s> {
    /// See [`strip_bytes`]
    #[inline]
    pub fn new(bytes: &'s [u8]) -> Self {
        Self {
            bytes,
            state: State::Ground,
            utf8parser: Default::default(),
        }
    }

    /// Strip the next slice of bytes
    ///
    /// Used when the content is in several non-contiguous slices
    ///
    /// # Panic
    ///
    /// May panic if it is not exhausted / empty
    #[inline]
    pub fn extend(&mut self, bytes: &'s [u8]) {
        debug_assert!(
            self.is_empty(),
            "current bytes must be processed to ensure we end at the right state"
        );
        self.bytes = bytes;
    }

    /// Report whether the bytes have been exhausted
    #[inline]
    pub fn is_empty(&self) -> bool {
        self.bytes.is_empty()
    }

    /// Create a [`Vec`] of the printable content
    #[inline]
    pub fn into_vec(self) -> Vec<u8> {
        let mut stripped = Vec::with_capacity(self.bytes.len());
        for printable in self {
            stripped.extend(printable);
        }
        stripped
    }
}

impl<'s> Iterator for StrippedBytes<'s> {
    type Item = &'s [u8];

    #[inline]
    fn next(&mut self) -> Option<Self::Item> {
        next_bytes(&mut self.bytes, &mut self.state, &mut self.utf8parser)
    }
}

/// Incrementally strip non-contiguous data
#[derive(Default, Clone, Debug, PartialEq, Eq)]
pub struct StripBytes {
    state: State,
    utf8parser: Utf8Parser,
}

impl StripBytes {
    /// Initial state
    pub fn new() -> Self {
        Default::default()
    }

    /// Strip the next segment of data
    pub fn strip_next<'s>(&'s mut self, bytes: &'s [u8]) -> StripBytesIter<'s> {
        StripBytesIter {
            bytes,
            state: &mut self.state,
            utf8parser: &mut self.utf8parser,
        }
    }
}

/// See [`StripBytes`]
#[derive(Debug, PartialEq, Eq)]
pub struct StripBytesIter<'s> {
    bytes: &'s [u8],
    state: &'s mut State,
    utf8parser: &'s mut Utf8Parser,
}

impl<'s> Iterator for StripBytesIter<'s> {
    type Item = &'s [u8];

    #[inline]
    fn next(&mut self) -> Option<Self::Item> {
        next_bytes(&mut self.bytes, self.state, self.utf8parser)
    }
}

#[inline]
fn next_bytes<'s>(
    bytes: &mut &'s [u8],
    state: &mut State,
    utf8parser: &mut Utf8Parser,
) -> Option<&'s [u8]> {
    let offset = bytes.iter().copied().position(|b| {
        if *state == State::Utf8 {
            true
        } else {
            let (next_state, action) = state_change(*state, b);
            if next_state != State::Anywhere {
                *state = next_state;
            }
            is_printable_bytes(action, b)
        }
    });
    let (_, next) = bytes.split_at(offset.unwrap_or(bytes.len()));
    *bytes = next;

    let offset = bytes.iter().copied().position(|b| {
        if *state == State::Utf8 {
            if utf8parser.add(b) {
                *state = State::Ground;
            }
            false
        } else {
            let (next_state, action) = state_change(State::Ground, b);
            if next_state != State::Anywhere {
                *state = next_state;
            }
            if *state == State::Utf8 {
                utf8parser.add(b);
                false
            } else {
                !is_printable_bytes(action, b)
            }
        }
    });
    let (printable, next) = bytes.split_at(offset.unwrap_or(bytes.len()));
    *bytes = next;
    if printable.is_empty() {
        None
    } else {
        Some(printable)
    }
}

#[derive(Default, Clone, Debug, PartialEq, Eq)]
pub struct Utf8Parser {
    utf8_parser: utf8parse::Parser,
}

impl Utf8Parser {
    fn add(&mut self, byte: u8) -> bool {
        let mut b = false;
        let mut receiver = VtUtf8Receiver(&mut b);
        self.utf8_parser.advance(&mut receiver, byte);
        b
    }
}

struct VtUtf8Receiver<'a>(&'a mut bool);

impl<'a> utf8parse::Receiver for VtUtf8Receiver<'a> {
    fn codepoint(&mut self, _: char) {
        *self.0 = true;
    }

    fn invalid_sequence(&mut self) {
        *self.0 = true;
    }
}

#[inline]
fn is_printable_bytes(action: Action, byte: u8) -> bool {
    // VT320 considered 0x7f to be `Print`able but we expect to be working in UTF-8 systems and not
    // ISO Latin-1, making it DEL and non-printable
    const DEL: u8 = 0x7f;

    // Continuations aren't included as they may also be control codes, requiring more context
    (action == Action::Print && byte != DEL)
        || action == Action::BeginUtf8
        || (action == Action::Execute && byte.is_ascii_whitespace())
}

#[cfg(test)]
mod test {
    use super::*;
    use proptest::prelude::*;

    /// Model based off full parser
    fn parser_strip(bytes: &[u8]) -> String {
        #[derive(Default)]
        struct Strip(String);
        impl Strip {
            fn with_capacity(capacity: usize) -> Self {
                Self(String::with_capacity(capacity))
            }
        }
        impl anstyle_parse::Perform for Strip {
            fn print(&mut self, c: char) {
                self.0.push(c);
            }

            fn execute(&mut self, byte: u8) {
                if byte.is_ascii_whitespace() {
                    self.0.push(byte as char);
                }
            }
        }

        let mut stripped = Strip::with_capacity(bytes.len());
        let mut parser = anstyle_parse::Parser::<anstyle_parse::DefaultCharAccumulator>::new();
        for byte in bytes {
            parser.advance(&mut stripped, *byte);
        }
        stripped.0
    }

    /// Model verifying incremental parsing
    fn strip_char(mut s: &str) -> String {
        let mut result = String::new();
        let mut state = StripStr::new();
        while !s.is_empty() {
            let mut indices = s.char_indices();
            indices.next(); // current
            let offset = indices.next().map(|(i, _)| i).unwrap_or_else(|| s.len());
            let (current, remainder) = s.split_at(offset);
            for printable in state.strip_next(current) {
                result.push_str(printable);
            }
            s = remainder;
        }
        result
    }

    /// Model verifying incremental parsing
    fn strip_byte(s: &[u8]) -> Vec<u8> {
        let mut result = Vec::new();
        let mut state = StripBytes::default();
        for start in 0..s.len() {
            let current = &s[start..=start];
            for printable in state.strip_next(current) {
                result.extend(printable);
            }
        }
        result
    }

    #[test]
    fn test_strip_bytes_multibyte() {
        let bytes = [240, 145, 141, 139];
        let expected = parser_strip(&bytes);
        let actual = String::from_utf8(strip_bytes(&bytes).into_vec()).unwrap();
        assert_eq!(expected, actual);
    }

    #[test]
    fn test_strip_byte_multibyte() {
        let bytes = [240, 145, 141, 139];
        let expected = parser_strip(&bytes);
        let actual = String::from_utf8(strip_byte(&bytes).to_vec()).unwrap();
        assert_eq!(expected, actual);
    }

    #[test]
    fn test_strip_str_del() {
        let input = std::str::from_utf8(&[0x7f]).unwrap();
        let expected = "";
        let actual = strip_str(input).to_string();
        assert_eq!(expected, actual);
    }

    #[test]
    fn test_strip_byte_del() {
        let bytes = [0x7f];
        let expected = "";
        let actual = String::from_utf8(strip_byte(&bytes).to_vec()).unwrap();
        assert_eq!(expected, actual);
    }

    #[test]
    fn test_strip_str_handles_broken_sequence() {
        // valid utf8: \xc3\xb6 then \x1b then \xf0\x9f\x98\x80
        let s = "ö\x1b😀hello😀goodbye";
        let mut it = strip_str(s);
        assert_eq!("ö", it.next().unwrap());
        assert_eq!("ello😀goodbye", it.next().unwrap());
    }

    proptest! {
        #[test]
        #[cfg_attr(miri, ignore)]  // See https://github.com/AltSysrq/proptest/issues/253
        fn strip_str_no_escapes(s in "\\PC*") {
            let expected = parser_strip(s.as_bytes());
            let actual = strip_str(&s).to_string();
            assert_eq!(expected, actual);
        }

        #[test]
        #[cfg_attr(miri, ignore)]  // See https://github.com/AltSysrq/proptest/issues/253
        fn strip_char_no_escapes(s in "\\PC*") {
            let expected = parser_strip(s.as_bytes());
            let actual = strip_char(&s);
            assert_eq!(expected, actual);
        }

        #[test]
        #[cfg_attr(miri, ignore)]  // See https://github.com/AltSysrq/proptest/issues/253
        fn strip_bytes_no_escapes(s in "\\PC*") {
            dbg!(&s);
            dbg!(s.as_bytes());
            let expected = parser_strip(s.as_bytes());
            let actual = String::from_utf8(strip_bytes(s.as_bytes()).into_vec()).unwrap();
            assert_eq!(expected, actual);
        }

        #[test]
        #[cfg_attr(miri, ignore)]  // See https://github.com/AltSysrq/proptest/issues/253
        fn strip_byte_no_escapes(s in "\\PC*") {
            dbg!(&s);
            dbg!(s.as_bytes());
            let expected = parser_strip(s.as_bytes());
            let actual = String::from_utf8(strip_byte(s.as_bytes()).to_vec()).unwrap();
            assert_eq!(expected, actual);
        }
    }
}
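`StripStr` and `StripBytes` above are the incremental counterparts of `strip_str`/`strip_bytes`: parser state is carried across `strip_next` calls, so an escape sequence split between two chunks is still recognized. A minimal sketch under that assumption (the chunk boundaries are made up for illustration; not part of the vendored diff):

```rust
fn main() {
    let mut state = anstream::adapter::StripStr::new();
    let mut out = String::new();
    // "\x1b[" ends one chunk and "32m" starts the next; the carried state
    // keeps the split escape sequence from leaking into the output.
    for chunk in ["foo \x1b[", "32mbar\x1b[0m"] {
        for printable in state.strip_next(chunk) {
            out.push_str(printable);
        }
    }
    assert_eq!(out, "foo bar");
}
```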
383
vendor/anstream/src/adapter/wincon.rs
vendored
Normal file
@ -0,0 +1,383 @@
/// Incrementally convert to wincon calls for non-contiguous data
#[derive(Default, Clone, Debug, PartialEq, Eq)]
pub struct WinconBytes {
    parser: anstyle_parse::Parser,
    capture: WinconCapture,
}

impl WinconBytes {
    /// Initial state
    pub fn new() -> Self {
        Default::default()
    }

    /// Strip the next segment of data
    pub fn extract_next<'s>(&'s mut self, bytes: &'s [u8]) -> WinconBytesIter<'s> {
        self.capture.reset();
        self.capture.printable.reserve(bytes.len());
        WinconBytesIter {
            bytes,
            parser: &mut self.parser,
            capture: &mut self.capture,
        }
    }
}

/// See [`WinconBytes`]
#[derive(Debug, PartialEq, Eq)]
pub struct WinconBytesIter<'s> {
    bytes: &'s [u8],
    parser: &'s mut anstyle_parse::Parser,
    capture: &'s mut WinconCapture,
}

impl<'s> Iterator for WinconBytesIter<'s> {
    type Item = (anstyle::Style, String);

    #[inline]
    fn next(&mut self) -> Option<Self::Item> {
        next_bytes(&mut self.bytes, self.parser, self.capture)
    }
}

#[inline]
fn next_bytes(
    bytes: &mut &[u8],
    parser: &mut anstyle_parse::Parser,
    capture: &mut WinconCapture,
) -> Option<(anstyle::Style, String)> {
    capture.reset();
    while capture.ready.is_none() {
        let byte = if let Some((byte, remainder)) = (*bytes).split_first() {
            *bytes = remainder;
            *byte
        } else {
            break;
        };
        parser.advance(capture, byte);
    }
    if capture.printable.is_empty() {
        return None;
    }

    let style = capture.ready.unwrap_or(capture.style);
    Some((style, std::mem::take(&mut capture.printable)))
}

#[derive(Default, Clone, Debug, PartialEq, Eq)]
struct WinconCapture {
    style: anstyle::Style,
    printable: String,
    ready: Option<anstyle::Style>,
}

impl WinconCapture {
    fn reset(&mut self) {
        self.ready = None;
    }
}

impl anstyle_parse::Perform for WinconCapture {
    /// Draw a character to the screen and update states.
    fn print(&mut self, c: char) {
        self.printable.push(c);
    }

    /// Execute a C0 or C1 control function.
    fn execute(&mut self, byte: u8) {
        if byte.is_ascii_whitespace() {
            self.printable.push(byte as char);
        }
    }

    fn csi_dispatch(
        &mut self,
        params: &anstyle_parse::Params,
        _intermediates: &[u8],
        ignore: bool,
        action: u8,
    ) {
        if ignore {
            return;
        }
        if action != b'm' {
            return;
        }

        let mut style = self.style;
        // param/value differences are dependent on the escape code
        let mut state = State::Normal;
        let mut r = None;
        let mut g = None;
        let mut color_target = ColorTarget::Fg;
        for param in params {
            for value in param {
                match (state, *value) {
                    (State::Normal, 0) => {
                        style = anstyle::Style::default();
                        break;
                    }
                    (State::Normal, 1) => {
                        style = style.bold();
                        break;
                    }
                    (State::Normal, 2) => {
                        style = style.dimmed();
                        break;
                    }
                    (State::Normal, 3) => {
                        style = style.italic();
                        break;
                    }
                    (State::Normal, 4) => {
                        style = style.underline();
                        state = State::Underline;
                    }
                    (State::Normal, 21) => {
                        style |= anstyle::Effects::DOUBLE_UNDERLINE;
                        break;
                    }
                    (State::Normal, 7) => {
                        style = style.invert();
                        break;
                    }
                    (State::Normal, 8) => {
                        style = style.hidden();
                        break;
                    }
                    (State::Normal, 9) => {
                        style = style.strikethrough();
                        break;
                    }
                    (State::Normal, 30..=37) => {
                        let color = to_ansi_color(value - 30).unwrap();
                        style = style.fg_color(Some(color.into()));
                        break;
                    }
                    (State::Normal, 38) => {
                        color_target = ColorTarget::Fg;
                        state = State::PrepareCustomColor;
                    }
                    (State::Normal, 39) => {
                        style = style.fg_color(None);
                        break;
                    }
                    (State::Normal, 40..=47) => {
                        let color = to_ansi_color(value - 40).unwrap();
                        style = style.bg_color(Some(color.into()));
                        break;
                    }
                    (State::Normal, 48) => {
                        color_target = ColorTarget::Bg;
                        state = State::PrepareCustomColor;
                    }
                    (State::Normal, 49) => {
                        style = style.bg_color(None);
                        break;
                    }
                    (State::Normal, 58) => {
                        color_target = ColorTarget::Underline;
                        state = State::PrepareCustomColor;
                    }
                    (State::Normal, 90..=97) => {
                        let color = to_ansi_color(value - 90).unwrap().bright(true);
                        style = style.fg_color(Some(color.into()));
                        break;
                    }
                    (State::Normal, 100..=107) => {
                        let color = to_ansi_color(value - 100).unwrap().bright(true);
                        style = style.bg_color(Some(color.into()));
                        break;
                    }
                    (State::PrepareCustomColor, 5) => {
                        state = State::Ansi256;
                    }
                    (State::PrepareCustomColor, 2) => {
                        state = State::Rgb;
                        r = None;
                        g = None;
                    }
                    (State::Ansi256, n) => {
                        let color = anstyle::Ansi256Color(n as u8);
                        style = match color_target {
                            ColorTarget::Fg => style.fg_color(Some(color.into())),
                            ColorTarget::Bg => style.bg_color(Some(color.into())),
                            ColorTarget::Underline => style.underline_color(Some(color.into())),
                        };
                        break;
                    }
                    (State::Rgb, b) => match (r, g) {
                        (None, _) => {
                            r = Some(b);
                        }
                        (Some(_), None) => {
                            g = Some(b);
                        }
                        (Some(r), Some(g)) => {
                            let color = anstyle::RgbColor(r as u8, g as u8, b as u8);
                            style = match color_target {
                                ColorTarget::Fg => style.fg_color(Some(color.into())),
                                ColorTarget::Bg => style.bg_color(Some(color.into())),
                                ColorTarget::Underline => style.underline_color(Some(color.into())),
                            };
                            break;
                        }
                    },
                    (State::Underline, 0) => {
                        style =
                            style.effects(style.get_effects().remove(anstyle::Effects::UNDERLINE));
                    }
                    (State::Underline, 1) => {
                        // underline already set
                    }
                    (State::Underline, 2) => {
                        style = style
                            .effects(style.get_effects().remove(anstyle::Effects::UNDERLINE))
                            | anstyle::Effects::DOUBLE_UNDERLINE;
                    }
                    (State::Underline, 3) => {
                        style = style
                            .effects(style.get_effects().remove(anstyle::Effects::UNDERLINE))
                            | anstyle::Effects::CURLY_UNDERLINE;
                    }
                    (State::Underline, 4) => {
                        style = style
                            .effects(style.get_effects().remove(anstyle::Effects::UNDERLINE))
                            | anstyle::Effects::DOTTED_UNDERLINE;
                    }
                    (State::Underline, 5) => {
                        style = style
                            .effects(style.get_effects().remove(anstyle::Effects::UNDERLINE))
                            | anstyle::Effects::DASHED_UNDERLINE;
                    }
                    _ => {
                        break;
                    }
                }
            }
        }

        if style != self.style && !self.printable.is_empty() {
            self.ready = Some(self.style);
        }
        self.style = style;
    }
}

#[derive(Copy, Clone, PartialEq, Eq, Debug)]
enum State {
    Normal,
    PrepareCustomColor,
    Ansi256,
    Rgb,
    Underline,
}

#[derive(Copy, Clone, PartialEq, Eq, Debug)]
enum ColorTarget {
    Fg,
    Bg,
    Underline,
}

fn to_ansi_color(digit: u16) -> Option<anstyle::AnsiColor> {
    match digit {
        0 => Some(anstyle::AnsiColor::Black),
        1 => Some(anstyle::AnsiColor::Red),
        2 => Some(anstyle::AnsiColor::Green),
        3 => Some(anstyle::AnsiColor::Yellow),
        4 => Some(anstyle::AnsiColor::Blue),
        5 => Some(anstyle::AnsiColor::Magenta),
        6 => Some(anstyle::AnsiColor::Cyan),
        7 => Some(anstyle::AnsiColor::White),
        _ => None,
    }
}

#[cfg(test)]
mod test {
    use super::*;
    use owo_colors::OwoColorize as _;
    use proptest::prelude::*;

    #[track_caller]
    fn verify(input: &str, expected: Vec<(anstyle::Style, &str)>) {
        let expected = expected
            .into_iter()
            .map(|(style, value)| (style, value.to_owned()))
            .collect::<Vec<_>>();
        let mut state = WinconBytes::new();
        let actual = state.extract_next(input.as_bytes()).collect::<Vec<_>>();
        assert_eq!(expected, actual, "{input:?}");
    }

    #[test]
    fn start() {
        let input = format!("{} world!", "Hello".green().on_red());
        let expected = vec![
            (
                anstyle::AnsiColor::Green.on(anstyle::AnsiColor::Red),
                "Hello",
            ),
            (anstyle::Style::default(), " world!"),
        ];
        verify(&input, expected);
    }

    #[test]
    fn middle() {
        let input = format!("Hello {}!", "world".green().on_red());
        let expected = vec![
            (anstyle::Style::default(), "Hello "),
            (
                anstyle::AnsiColor::Green.on(anstyle::AnsiColor::Red),
                "world",
            ),
            (anstyle::Style::default(), "!"),
        ];
        verify(&input, expected);
    }

    #[test]
    fn end() {
        let input = format!("Hello {}", "world!".green().on_red());
        let expected = vec![
            (anstyle::Style::default(), "Hello "),
            (
                anstyle::AnsiColor::Green.on(anstyle::AnsiColor::Red),
                "world!",
            ),
        ];
        verify(&input, expected);
    }

    #[test]
    fn ansi256_colors() {
        // termcolor only supports "brights" via these
        let input = format!(
            "Hello {}!",
            "world".color(owo_colors::XtermColors::UserBrightYellow)
        );
        let expected = vec![
            (anstyle::Style::default(), "Hello "),
            (anstyle::Ansi256Color(11).on_default(), "world"),
            (anstyle::Style::default(), "!"),
        ];
        verify(&input, expected);
    }

    proptest! {
        #[test]
        #[cfg_attr(miri, ignore)]  // See https://github.com/AltSysrq/proptest/issues/253
        fn wincon_no_escapes(s in "\\PC*") {
            let expected = if s.is_empty() {
                vec![]
            } else {
                vec![(anstyle::Style::default(), s.clone())]
            };
            let mut state = WinconBytes::new();
            let actual = state.extract_next(s.as_bytes()).collect::<Vec<_>>();
            assert_eq!(expected, actual);
        }
    }
}
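`WinconBytes` above does not strip styling; it splits an ANSI byte stream into `(style, text)` runs that a console API (or any renderer) can replay. A minimal sketch of consuming those runs; the input bytes are illustrative and this is not part of the vendored diff:

```rust
fn main() {
    let mut state = anstream::adapter::WinconBytes::new();
    // Each item pairs the active style with the text printed under it.
    for (style, text) in state.extract_next(b"\x1b[32mgreen\x1b[0m plain") {
        println!("{style:?}: {text:?}");
    }
}
```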
301
vendor/anstream/src/auto.rs
vendored
Normal file
@ -0,0 +1,301 @@
use crate::stream::AsLockedWrite;
use crate::stream::RawStream;
use crate::ColorChoice;
use crate::StripStream;
#[cfg(all(windows, feature = "wincon"))]
use crate::WinconStream;

/// [`std::io::Write`] that adapts ANSI escape codes to the underlying `Write`'s capabilities
///
/// This includes
/// - Stripping colors for non-terminals
/// - Respecting env variables like [NO_COLOR](https://no-color.org/) or [CLICOLOR](https://bixense.com/clicolors/)
/// - *(windows)* Falling back to the wincon API where [ENABLE_VIRTUAL_TERMINAL_PROCESSING](https://learn.microsoft.com/en-us/windows/console/console-virtual-terminal-sequences#output-sequences) is unsupported
///
/// You can customize auto-detection by calling into
/// [anstyle_query](https://docs.rs/anstyle-query/latest/anstyle_query/)
/// to get a [`ColorChoice`] and then calling [`AutoStream::new(stream, choice)`].
#[derive(Debug)]
pub struct AutoStream<S: RawStream> {
    inner: StreamInner<S>,
}

#[derive(Debug)]
enum StreamInner<S: RawStream> {
    PassThrough(S),
    Strip(StripStream<S>),
    #[cfg(all(windows, feature = "wincon"))]
    Wincon(WinconStream<S>),
}

impl<S> AutoStream<S>
where
    S: RawStream,
{
    /// Runtime control over styling behavior
    ///
    /// # Example
    ///
    /// ```rust
    /// # #[cfg(feature = "auto")] {
    /// # use std::io::IsTerminal as _;
    /// // Like `AutoStream::choice` but without `NO_COLOR`, `CLICOLOR_FORCE`, `CI`
    /// fn choice(raw: &dyn anstream::stream::RawStream) -> anstream::ColorChoice {
    ///     let choice = anstream::ColorChoice::global();
    ///     if choice == anstream::ColorChoice::Auto {
    ///         if raw.is_terminal() && anstyle_query::term_supports_color() {
    ///             anstream::ColorChoice::Always
    ///         } else {
    ///             anstream::ColorChoice::Never
    ///         }
    ///     } else {
    ///         choice
    ///     }
    /// }
    ///
    /// let stream = std::io::stdout();
    /// let choice = choice(&stream);
    /// let auto = anstream::AutoStream::new(stream, choice);
    /// # }
    /// ```
    #[inline]
    pub fn new(raw: S, choice: ColorChoice) -> Self {
        match choice {
            #[cfg(feature = "auto")]
            ColorChoice::Auto => Self::auto(raw),
            #[cfg(not(feature = "auto"))]
            ColorChoice::Auto => Self::never(raw),
            ColorChoice::AlwaysAnsi => Self::always_ansi(raw),
            ColorChoice::Always => Self::always(raw),
            ColorChoice::Never => Self::never(raw),
        }
    }

    /// Auto-adapt for the stream's capabilities
    #[cfg(feature = "auto")]
    #[inline]
    pub fn auto(raw: S) -> Self {
        let choice = Self::choice(&raw);
        debug_assert_ne!(choice, ColorChoice::Auto);
        Self::new(raw, choice)
    }

    /// Report the desired choice for the given stream
    #[cfg(feature = "auto")]
    pub fn choice(raw: &S) -> ColorChoice {
        choice(raw)
    }

    /// Force ANSI escape codes to be passed through as-is, no matter what the inner `Write`
    /// supports.
    #[inline]
    pub fn always_ansi(raw: S) -> Self {
        #[cfg(feature = "auto")]
        {
            if raw.is_terminal() {
                let _ = anstyle_query::windows::enable_ansi_colors();
            }
        }
        Self::always_ansi_(raw)
    }

    #[inline]
    fn always_ansi_(raw: S) -> Self {
        let inner = StreamInner::PassThrough(raw);
        AutoStream { inner }
    }

    /// Force color, no matter what the inner `Write` supports.
    #[inline]
    pub fn always(raw: S) -> Self {
        if cfg!(windows) {
            #[cfg(feature = "auto")]
            let use_wincon = raw.is_terminal()
                && !anstyle_query::windows::enable_ansi_colors().unwrap_or(true)
                && !anstyle_query::term_supports_ansi_color();
            #[cfg(not(feature = "auto"))]
            let use_wincon = true;
            if use_wincon {
                Self::wincon(raw).unwrap_or_else(|raw| Self::always_ansi_(raw))
            } else {
                Self::always_ansi_(raw)
            }
        } else {
            Self::always_ansi(raw)
        }
    }

    /// Only pass printable data to the inner `Write`.
    #[inline]
    pub fn never(raw: S) -> Self {
        let inner = StreamInner::Strip(StripStream::new(raw));
        AutoStream { inner }
    }

    #[inline]
    fn wincon(raw: S) -> Result<Self, S> {
        #[cfg(all(windows, feature = "wincon"))]
        {
            Ok(Self {
                inner: StreamInner::Wincon(WinconStream::new(raw)),
            })
        }
        #[cfg(not(all(windows, feature = "wincon")))]
        {
            Err(raw)
        }
    }

    /// Get the wrapped [`RawStream`]
    #[inline]
    pub fn into_inner(self) -> S {
        match self.inner {
            StreamInner::PassThrough(w) => w,
            StreamInner::Strip(w) => w.into_inner(),
            #[cfg(all(windows, feature = "wincon"))]
            StreamInner::Wincon(w) => w.into_inner(),
        }
    }

    #[inline]
    pub fn is_terminal(&self) -> bool {
        match &self.inner {
            StreamInner::PassThrough(w) => w.is_terminal(),
            StreamInner::Strip(w) => w.is_terminal(),
            #[cfg(all(windows, feature = "wincon"))]
            StreamInner::Wincon(_) => true, // it's only ever a terminal
        }
    }

    /// Prefer [`AutoStream::choice`]
    ///
    /// This doesn't report what is requested but what is currently active.
    #[inline]
    #[cfg(feature = "auto")]
    pub fn current_choice(&self) -> ColorChoice {
        match &self.inner {
            StreamInner::PassThrough(_) => ColorChoice::AlwaysAnsi,
            StreamInner::Strip(_) => ColorChoice::Never,
            #[cfg(all(windows, feature = "wincon"))]
            StreamInner::Wincon(_) => ColorChoice::Always,
        }
    }
}

#[cfg(feature = "auto")]
fn choice(raw: &dyn RawStream) -> ColorChoice {
    let choice = ColorChoice::global();
    match choice {
        ColorChoice::Auto => {
            let clicolor = anstyle_query::clicolor();
            let clicolor_enabled = clicolor.unwrap_or(false);
            let clicolor_disabled = !clicolor.unwrap_or(true);
            if anstyle_query::no_color() {
                ColorChoice::Never
            } else if anstyle_query::clicolor_force() {
                ColorChoice::Always
            } else if clicolor_disabled {
                ColorChoice::Never
            } else if raw.is_terminal()
                && (anstyle_query::term_supports_color()
                    || clicolor_enabled
                    || anstyle_query::is_ci())
            {
                ColorChoice::Always
            } else {
                ColorChoice::Never
            }
        }
        ColorChoice::AlwaysAnsi | ColorChoice::Always | ColorChoice::Never => choice,
    }
}

impl AutoStream<std::io::Stdout> {
    /// Get exclusive access to the `AutoStream`
    ///
    /// Why?
    /// - Faster performance when writing in a loop
    /// - Avoid other threads interleaving output with the current thread
    #[inline]
    pub fn lock(self) -> AutoStream<std::io::StdoutLock<'static>> {
        let inner = match self.inner {
            StreamInner::PassThrough(w) => StreamInner::PassThrough(w.lock()),
            StreamInner::Strip(w) => StreamInner::Strip(w.lock()),
            #[cfg(all(windows, feature = "wincon"))]
            StreamInner::Wincon(w) => StreamInner::Wincon(w.lock()),
        };
        AutoStream { inner }
    }
}

impl AutoStream<std::io::Stderr> {
    /// Get exclusive access to the `AutoStream`
    ///
    /// Why?
    /// - Faster performance when writing in a loop
    /// - Avoid other threads interleaving output with the current thread
    #[inline]
    pub fn lock(self) -> AutoStream<std::io::StderrLock<'static>> {
        let inner = match self.inner {
            StreamInner::PassThrough(w) => StreamInner::PassThrough(w.lock()),
            StreamInner::Strip(w) => StreamInner::Strip(w.lock()),
            #[cfg(all(windows, feature = "wincon"))]
            StreamInner::Wincon(w) => StreamInner::Wincon(w.lock()),
        };
        AutoStream { inner }
    }
}

impl<S> std::io::Write for AutoStream<S>
where
    S: RawStream + AsLockedWrite,
{
    // Must forward all calls to ensure locking happens appropriately
    #[inline]
    fn write(&mut self, buf: &[u8]) -> std::io::Result<usize> {
        match &mut self.inner {
            StreamInner::PassThrough(w) => w.as_locked_write().write(buf),
            StreamInner::Strip(w) => w.write(buf),
            #[cfg(all(windows, feature = "wincon"))]
            StreamInner::Wincon(w) => w.write(buf),
        }
    }
    #[inline]
    fn write_vectored(&mut self, bufs: &[std::io::IoSlice<'_>]) -> std::io::Result<usize> {
        match &mut self.inner {
            StreamInner::PassThrough(w) => w.as_locked_write().write_vectored(bufs),
            StreamInner::Strip(w) => w.write_vectored(bufs),
            #[cfg(all(windows, feature = "wincon"))]
            StreamInner::Wincon(w) => w.write_vectored(bufs),
        }
    }
    // is_write_vectored: nightly only
    #[inline]
    fn flush(&mut self) -> std::io::Result<()> {
        match &mut self.inner {
            StreamInner::PassThrough(w) => w.as_locked_write().flush(),
            StreamInner::Strip(w) => w.flush(),
            #[cfg(all(windows, feature = "wincon"))]
            StreamInner::Wincon(w) => w.flush(),
        }
    }
    #[inline]
    fn write_all(&mut self, buf: &[u8]) -> std::io::Result<()> {
        match &mut self.inner {
            StreamInner::PassThrough(w) => w.as_locked_write().write_all(buf),
            StreamInner::Strip(w) => w.write_all(buf),
            #[cfg(all(windows, feature = "wincon"))]
            StreamInner::Wincon(w) => w.write_all(buf),
        }
    }
    // write_all_vectored: nightly only
    #[inline]
    fn write_fmt(&mut self, args: std::fmt::Arguments<'_>) -> std::io::Result<()> {
        match &mut self.inner {
            StreamInner::PassThrough(w) => w.as_locked_write().write_fmt(args),
            StreamInner::Strip(w) => w.write_fmt(args),
            #[cfg(all(windows, feature = "wincon"))]
            StreamInner::Wincon(w) => w.write_fmt(args),
        }
    }
}
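Tying the pieces of `auto.rs` together: `AutoStream::new` dispatches to pass-through, strip, or wincon based on the `ColorChoice`. A minimal sketch using an in-memory `Vec<u8>` target with an explicit `ColorChoice::Never` (the same capture pattern the crate's own `print!` macro uses); illustrative only, not part of the vendored diff:

```rust
use std::io::Write as _;

fn main() -> std::io::Result<()> {
    // Force stripping regardless of the target's capabilities.
    let mut out = anstream::AutoStream::new(Vec::new(), anstream::ColorChoice::Never);
    write!(out, "\x1b[31mred?\x1b[0m")?;
    let buffer = out.into_inner();
    assert_eq!(buffer, b"red?"); // escapes were stripped on the way through
    Ok(())
}
```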
68
vendor/anstream/src/buffer.rs
vendored
Normal file
@ -0,0 +1,68 @@
#![allow(deprecated)]

/// In-memory [`RawStream`][crate::stream::RawStream]
#[derive(Clone, Default, Debug, PartialEq, Eq)]
#[deprecated(since = "0.6.2", note = "Use Vec")]
#[doc(hidden)]
pub struct Buffer(Vec<u8>);

impl Buffer {
    #[inline]
    pub fn new() -> Self {
        Default::default()
    }

    #[inline]
    pub fn with_capacity(capacity: usize) -> Self {
        Self(Vec::with_capacity(capacity))
    }

    #[inline]
    pub fn as_bytes(&self) -> &[u8] {
        &self.0
    }
}

impl AsRef<[u8]> for Buffer {
    #[inline]
    fn as_ref(&self) -> &[u8] {
        self.as_bytes()
    }
}

impl std::io::Write for Buffer {
    #[inline]
    fn write(&mut self, buf: &[u8]) -> std::io::Result<usize> {
        self.0.extend(buf);
        Ok(buf.len())
    }

    #[inline]
    fn flush(&mut self) -> std::io::Result<()> {
        Ok(())
    }
}

#[cfg(all(windows, feature = "wincon"))]
impl anstyle_wincon::WinconStream for Buffer {
    fn write_colored(
        &mut self,
        fg: Option<anstyle::AnsiColor>,
        bg: Option<anstyle::AnsiColor>,
        data: &[u8],
    ) -> std::io::Result<usize> {
        self.0.write_colored(fg, bg, data)
    }
}

#[cfg(all(windows, feature = "wincon"))]
impl anstyle_wincon::WinconStream for &'_ mut Buffer {
    fn write_colored(
        &mut self,
        fg: Option<anstyle::AnsiColor>,
        bg: Option<anstyle::AnsiColor>,
        data: &[u8],
    ) -> std::io::Result<usize> {
        (**self).write_colored(fg, bg, data)
    }
}
54
vendor/anstream/src/fmt.rs
vendored
Normal file
@ -0,0 +1,54 @@
/// A shim which allows a [`std::io::Write`] to be implemented in terms of a [`std::fmt::Write`]
///
/// This saves off I/O errors instead of discarding them
pub(crate) struct Adapter<W>
where
    W: FnMut(&[u8]) -> std::io::Result<()>,
{
    writer: W,
    error: std::io::Result<()>,
}

impl<W> Adapter<W>
where
    W: FnMut(&[u8]) -> std::io::Result<()>,
{
    pub(crate) fn new(writer: W) -> Self {
        Adapter {
            writer,
            error: Ok(()),
        }
    }

    pub(crate) fn write_fmt(mut self, fmt: std::fmt::Arguments<'_>) -> std::io::Result<()> {
        match std::fmt::write(&mut self, fmt) {
            Ok(()) => Ok(()),
            Err(..) => {
                // check if the error came from the underlying `Write` or not
                if self.error.is_err() {
                    self.error
                } else {
                    Err(std::io::Error::new(
                        std::io::ErrorKind::Other,
                        "formatter error",
                    ))
                }
            }
        }
    }
}

impl<W> std::fmt::Write for Adapter<W>
where
    W: FnMut(&[u8]) -> std::io::Result<()>,
{
    fn write_str(&mut self, s: &str) -> std::fmt::Result {
        match (self.writer)(s.as_bytes()) {
            Ok(()) => Ok(()),
            Err(e) => {
                self.error = Err(e);
                Err(std::fmt::Error)
            }
        }
    }
}
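The `Adapter` above illustrates a reusable trick: `std::fmt::Error` carries no payload, so the real `std::io::Error` is stashed on the struct and recovered after `std::fmt::write` returns. A standalone sketch of the same idea, with illustrative names (`Shim`) rather than code from this crate:

```rust
use std::fmt;
use std::io;

// Minimal demo of the error-stashing shim: route fmt output into an
// io-style sink while keeping the io::Error that fmt would erase.
struct Shim<'a> {
    sink: &'a mut dyn io::Write,
    error: io::Result<()>,
}

impl fmt::Write for Shim<'_> {
    fn write_str(&mut self, s: &str) -> fmt::Result {
        self.sink.write_all(s.as_bytes()).map_err(|e| {
            self.error = Err(e); // remember the real cause
            fmt::Error
        })
    }
}

fn main() -> io::Result<()> {
    let mut out = Vec::new();
    let mut shim = Shim { sink: &mut out, error: Ok(()) };
    if fmt::write(&mut shim, format_args!("{} + {} = {}", 1, 2, 3)).is_err() {
        return shim.error; // surface the stashed io::Error
    }
    assert_eq!(out, b"1 + 2 = 3");
    Ok(())
}
```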
81
vendor/anstream/src/lib.rs
vendored
Normal file
@ -0,0 +1,81 @@
//! **Auto-adapting [`stdout`] / [`stderr`] streams**
//!
//! *A portmanteau of "ansi stream"*
//!
//! [`AutoStream`] always accepts [ANSI escape codes](https://en.wikipedia.org/wiki/ANSI_escape_code),
//! [adapting to the user's terminal's capabilities][AutoStream].
//!
//! Benefits
//! - Allows the caller to not be concerned with the terminal's capabilities
//! - Semver safe way of passing styled text between crates as ANSI escape codes offer more
//!   compatibility than most crate APIs.
//!
//! Available styling crates:
//! - [anstyle](https://docs.rs/anstyle) for minimal runtime styling, designed to go in public APIs
//!   (once it hits 1.0)
//! - [owo-colors](https://docs.rs/owo-colors) for feature-rich runtime styling
//! - [color-print](https://docs.rs/color-print) for feature-rich compile-time styling
//!
//! # Example
//!
//! ```
//! # #[cfg(feature = "auto")] {
//! use anstream::println;
//! use owo_colors::OwoColorize as _;
//!
//! // Foreground colors
//! println!("My number is {:#x}!", 10.green());
//! // Background colors
//! println!("My number is not {}!", 4.on_red());
//! # }
//! ```
//!
//! And this will correctly handle piping to a file, etc

#![cfg_attr(docsrs, feature(doc_auto_cfg))]

pub mod adapter;
pub mod stream;

mod buffer;
#[macro_use]
mod macros;
mod auto;
mod fmt;
mod strip;
#[cfg(all(windows, feature = "wincon"))]
mod wincon;

pub use auto::AutoStream;
pub use strip::StripStream;
#[cfg(all(windows, feature = "wincon"))]
pub use wincon::WinconStream;

#[allow(deprecated)]
pub use buffer::Buffer;

pub type Stdout = AutoStream<std::io::Stdout>;
pub type Stderr = AutoStream<std::io::Stderr>;

/// Create an ANSI escape code compatible stdout
///
/// **Note:** Call [`AutoStream::lock`] in loops to avoid the performance hit of acquiring/releasing
/// from the implicit locking in each [`std::io::Write`] call
#[cfg(feature = "auto")]
pub fn stdout() -> Stdout {
    let stdout = std::io::stdout();
    AutoStream::auto(stdout)
}

/// Create an ANSI escape code compatible stderr
///
/// **Note:** Call [`AutoStream::lock`] in loops to avoid the performance hit of acquiring/releasing
/// from the implicit locking in each [`std::io::Write`] call
#[cfg(feature = "auto")]
pub fn stderr() -> Stderr {
    let stderr = std::io::stderr();
    AutoStream::auto(stderr)
}

/// Selection for overriding color output
pub use colorchoice::ColorChoice;
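A small sketch of the locking advice in the `stdout()` docs above, assuming the `auto` feature is enabled (illustrative only, not part of the vendored diff):

```rust
use std::io::Write as _;

fn main() -> std::io::Result<()> {
    // Lock once instead of once per write!() call in the hot loop.
    let mut out = anstream::stdout().lock();
    for i in 0..3 {
        writeln!(out, "line {i}")?;
    }
    out.flush()
}
```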
389
vendor/anstream/src/macros.rs
vendored
Normal file
@ -0,0 +1,389 @@
/// Prints to [`stdout`][crate::stdout].
///
/// Equivalent to the [`println!`] macro except that a newline is not printed at
/// the end of the message.
///
/// Note that stdout is frequently line-buffered by default so it may be
/// necessary to use [`std::io::Write::flush()`] to ensure the output is emitted
/// immediately.
///
/// **NOTE:** The `print!` macro will lock the standard output on each call. If you call
/// `print!` within a hot loop, this behavior may be the bottleneck of the loop.
/// To avoid this, lock stdout with [`AutoStream::lock`][crate::AutoStream::lock]:
/// ```
/// # #[cfg(feature = "auto")] {
/// use std::io::Write as _;
///
/// let mut lock = anstream::stdout().lock();
/// write!(lock, "hello world").unwrap();
/// # }
/// ```
///
/// Use `print!` only for the primary output of your program. Use
/// [`eprint!`] instead to print error and progress messages.
///
/// **NOTE:** Not all `print!` calls will be captured in tests like [`std::print!`]
/// - Capturing will automatically be activated in test binaries
/// - Otherwise, only when the `test` feature is enabled
///
/// # Panics
///
/// Panics if writing to `stdout` fails for any reason **except** broken pipe.
///
/// Writing to non-blocking stdout can cause an error, which will lead
/// this macro to panic.
///
/// # Examples
///
/// ```
/// # #[cfg(feature = "auto")] {
/// use std::io::Write as _;
/// use anstream::print;
/// use anstream::stdout;
///
/// print!("this ");
/// print!("will ");
/// print!("be ");
/// print!("on ");
/// print!("the ");
/// print!("same ");
/// print!("line ");
///
/// stdout().flush().unwrap();
///
/// print!("this string has a newline, why not choose println! instead?\n");
///
/// stdout().flush().unwrap();
/// # }
/// ```
#[cfg(feature = "auto")]
#[macro_export]
macro_rules! print {
    ($($arg:tt)*) => {{
        if cfg!(any(feature = "test", test)) {
            use std::io::Write as _;

            let stdio = std::io::stdout();
            let choice = $crate::AutoStream::choice(&stdio);
            let buffer = Vec::new();
            let mut stream = $crate::AutoStream::new(buffer, choice);
            // Ignore errors rather than panic
            let _ = ::std::write!(&mut stream, $($arg)*);
            let buffer = stream.into_inner();
            // Should be UTF-8 but not wanting to panic
            let buffer = String::from_utf8_lossy(&buffer);
            ::std::print!("{}", buffer)
        } else {
            use std::io::Write as _;

            let mut stream = $crate::stdout();
            match ::std::write!(&mut stream, $($arg)*) {
                Err(e) if e.kind() != ::std::io::ErrorKind::BrokenPipe => {
                    ::std::panic!("failed printing to stdout: {e}");
                }
                Err(_) | Ok(_) => {}
            }
        }
    }};
}

/// Prints to [`stdout`][crate::stdout], with a newline.
///
/// On all platforms, the newline is the LINE FEED character (`\n`/`U+000A`) alone
/// (no additional CARRIAGE RETURN (`\r`/`U+000D`)).
///
/// This macro uses the same syntax as [`format!`], but writes to the standard output instead.
/// See [`std::fmt`] for more information.
///
/// **NOTE:** The `println!` macro will lock the standard output on each call. If you call
/// `println!` within a hot loop, this behavior may be the bottleneck of the loop.
/// To avoid this, lock stdout with [`AutoStream::lock`][crate::AutoStream::lock]:
/// ```
/// # #[cfg(feature = "auto")] {
/// use std::io::Write as _;
///
/// let mut lock = anstream::stdout().lock();
/// writeln!(lock, "hello world").unwrap();
/// # }
/// ```
///
/// Use `println!` only for the primary output of your program. Use
/// [`eprintln!`] instead to print error and progress messages.
///
/// **NOTE:** Not all `println!` calls will be captured in tests like [`std::println!`]
/// - Capturing will automatically be activated in test binaries
/// - Otherwise, only when the `test` feature is enabled
///
/// # Panics
///
/// Panics if writing to `stdout` fails for any reason **except** broken pipe.
///
/// Writing to non-blocking stdout can cause an error, which will lead
/// this macro to panic.
///
/// # Examples
///
/// ```
/// # #[cfg(feature = "auto")] {
/// use anstream::println;
///
/// println!(); // prints just a newline
/// println!("hello there!");
/// println!("format {} arguments", "some");
/// let local_variable = "some";
/// println!("format {local_variable} arguments");
/// # }
/// ```
#[cfg(feature = "auto")]
#[macro_export]
macro_rules! println {
    () => {
        $crate::print!("\n")
    };
    ($($arg:tt)*) => {{
        if cfg!(any(feature = "test", test)) {
            use std::io::Write as _;

            let stdio = std::io::stdout();
            let choice = $crate::AutoStream::choice(&stdio);
            let buffer = Vec::new();
            let mut stream = $crate::AutoStream::new(buffer, choice);
            // Ignore errors rather than panic
            let _ = ::std::write!(&mut stream, $($arg)*);
            let buffer = stream.into_inner();
            // Should be UTF-8 but not wanting to panic
            let buffer = String::from_utf8_lossy(&buffer);
            ::std::println!("{}", buffer)
        } else {
            use std::io::Write as _;

            let mut stream = $crate::stdout();
            match ::std::writeln!(&mut stream, $($arg)*) {
                Err(e) if e.kind() != ::std::io::ErrorKind::BrokenPipe => {
                    ::std::panic!("failed printing to stdout: {e}");
                }
                Err(_) | Ok(_) => {}
            }
        }
    }};
}

/// Prints to [`stderr`][crate::stderr].
///
/// Equivalent to the [`print!`] macro, except that output goes to
/// `stderr` instead of `stdout`. See [`print!`] for
/// example usage.
///
/// Use `eprint!` only for error and progress messages. Use `print!`
/// instead for the primary output of your program.
///
/// **NOTE:** Not all `eprint!` calls will be captured in tests like [`std::eprint!`]
/// - Capturing will automatically be activated in test binaries
/// - Otherwise, only when the `test` feature is enabled
///
/// # Panics
///
/// Panics if writing to `stderr` fails for any reason **except** broken pipe.
///
/// Writing to non-blocking stdout can cause an error, which will lead
/// this macro to panic.
///
/// # Examples
///
/// ```
/// # #[cfg(feature = "auto")] {
/// use anstream::eprint;
///
/// eprint!("Error: Could not complete task");
/// # }
/// ```
#[cfg(feature = "auto")]
#[macro_export]
macro_rules! eprint {
    ($($arg:tt)*) => {{
        if cfg!(any(feature = "test", test)) {
            use std::io::Write as _;

            let stdio = std::io::stderr();
            let choice = $crate::AutoStream::choice(&stdio);
            let buffer = Vec::new();
            let mut stream = $crate::AutoStream::new(buffer, choice);
            // Ignore errors rather than panic
            let _ = ::std::write!(&mut stream, $($arg)*);
            let buffer = stream.into_inner();
            // Should be UTF-8 but not wanting to panic
            let buffer = String::from_utf8_lossy(&buffer);
            ::std::eprint!("{}", buffer)
        } else {
            use std::io::Write as _;

            let mut stream = $crate::stderr();
            match ::std::write!(&mut stream, $($arg)*) {
                Err(e) if e.kind() != ::std::io::ErrorKind::BrokenPipe => {
                    ::std::panic!("failed printing to stdout: {e}");
                }
                Err(_) | Ok(_) => {}
            }
        }
    }};
}

/// Prints to [`stderr`][crate::stderr], with a newline.
///
/// Equivalent to the [`println!`] macro, except that output goes to
/// `stderr` instead of `stdout`. See [`println!`] for
/// example usage.
///
/// Use `eprintln!` only for error and progress messages. Use `println!`
/// instead for the primary output of your program.
///
/// **NOTE:** Not all `eprintln!` calls will be captured in tests like [`std::eprintln!`]
/// - Capturing will automatically be activated in test binaries
/// - Otherwise, only when the `test` feature is enabled
///
/// # Panics
///
/// Panics if writing to `stderr` fails for any reason **except** broken pipe.
///
/// Writing to non-blocking stdout can cause an error, which will lead
/// this macro to panic.
///
/// # Examples
///
/// ```
/// # #[cfg(feature = "auto")] {
/// use anstream::eprintln;
///
/// eprintln!("Error: Could not complete task");
/// # }
/// ```
#[cfg(feature = "auto")]
#[macro_export]
macro_rules! eprintln {
    () => {
        $crate::eprint!("\n")
    };
    ($($arg:tt)*) => {{
        if cfg!(any(feature = "test", test)) {
            use std::io::Write as _;

            let stdio = std::io::stderr();
            let choice = $crate::AutoStream::choice(&stdio);
            let buffer = Vec::new();
            let mut stream = $crate::AutoStream::new(buffer, choice);
            // Ignore errors rather than panic
            let _ = ::std::write!(&mut stream, $($arg)*);
            let buffer = stream.into_inner();
            // Should be UTF-8 but not wanting to panic
            let buffer = String::from_utf8_lossy(&buffer);
            ::std::eprintln!("{}", buffer)
        } else {
            use std::io::Write as _;

            let mut stream = $crate::stderr();
            match ::std::writeln!(&mut stream, $($arg)*) {
                Err(e) if e.kind() != ::std::io::ErrorKind::BrokenPipe => {
                    ::std::panic!("failed printing to stdout: {e}");
                }
                Err(_) | Ok(_) => {}
            }
        }
    }};
}

/// Panics the current thread.
///
/// This allows a program to terminate immediately and provide feedback
/// to the caller of the program.
///
/// This macro is the perfect way to assert conditions in example code and in
/// tests. `panic!` is closely tied with the `unwrap` method of both
/// [`Option`][ounwrap] and [`Result`][runwrap] enums. Both implementations call
/// `panic!` when they are set to [`None`] or [`Err`] variants.
///
|
||||
/// When using `panic!()` you can specify a string payload, that is built using
|
||||
/// the [`format!`] syntax. That payload is used when injecting the panic into
|
||||
/// the calling Rust thread, causing the thread to panic entirely.
|
||||
///
|
||||
/// The behavior of the default `std` hook, i.e. the code that runs directly
|
||||
/// after the panic is invoked, is to print the message payload to
|
||||
/// `stderr` along with the file/line/column information of the `panic!()`
|
||||
/// call. You can override the panic hook using [`std::panic::set_hook()`].
|
||||
/// Inside the hook a panic can be accessed as a `&dyn Any + Send`,
|
||||
/// which contains either a `&str` or `String` for regular `panic!()` invocations.
|
||||
/// To panic with a value of another other type, [`panic_any`] can be used.
|
||||
///
|
||||
/// See also the macro [`compile_error!`], for raising errors during compilation.
|
||||
///
|
||||
/// # When to use `panic!` vs `Result`
|
||||
///
|
||||
/// The Rust language provides two complementary systems for constructing /
|
||||
/// representing, reporting, propagating, reacting to, and discarding errors. These
|
||||
/// responsibilities are collectively known as "error handling." `panic!` and
|
||||
/// `Result` are similar in that they are each the primary interface of their
|
||||
/// respective error handling systems; however, the meaning these interfaces attach
|
||||
/// to their errors and the responsibilities they fulfill within their respective
|
||||
/// error handling systems differ.
|
||||
///
|
||||
/// The `panic!` macro is used to construct errors that represent a bug that has
|
||||
/// been detected in your program. With `panic!` you provide a message that
|
||||
/// describes the bug and the language then constructs an error with that message,
|
||||
/// reports it, and propagates it for you.
|
||||
///
|
||||
/// `Result` on the other hand is used to wrap other types that represent either
|
||||
/// the successful result of some computation, `Ok(T)`, or error types that
|
||||
/// represent an anticipated runtime failure mode of that computation, `Err(E)`.
|
||||
/// `Result` is used alongside user defined types which represent the various
|
||||
/// anticipated runtime failure modes that the associated computation could
|
||||
/// encounter. `Result` must be propagated manually, often with the the help of the
|
||||
/// `?` operator and `Try` trait, and they must be reported manually, often with
|
||||
/// the help of the `Error` trait.
|
||||
///
|
||||
/// For more detailed information about error handling check out the [book] or the
|
||||
/// [`std::result`] module docs.
|
||||
///
|
||||
/// [ounwrap]: Option::unwrap
|
||||
/// [runwrap]: Result::unwrap
|
||||
/// [`std::panic::set_hook()`]: ../std/panic/fn.set_hook.html
|
||||
/// [`panic_any`]: ../std/panic/fn.panic_any.html
|
||||
/// [`Box`]: ../std/boxed/struct.Box.html
|
||||
/// [`Any`]: crate::any::Any
|
||||
/// [`format!`]: ../std/macro.format.html
|
||||
/// [book]: ../book/ch09-00-error-handling.html
|
||||
/// [`std::result`]: ../std/result/index.html
|
||||
///
|
||||
/// # Current implementation
|
||||
///
|
||||
/// If the main thread panics it will terminate all your threads and end your
|
||||
/// program with code `101`.
|
||||
///
|
||||
/// # Examples
|
||||
///
|
||||
/// ```should_panic
|
||||
/// # #![allow(unreachable_code)]
|
||||
/// use anstream::panic;
|
||||
/// panic!();
|
||||
/// panic!("this is a terrible mistake!");
|
||||
/// panic!("this is a {} {message}", "fancy", message = "message");
|
||||
/// ```
|
||||
#[cfg(feature = "auto")]
|
||||
#[macro_export]
|
||||
macro_rules! panic {
|
||||
() => {
|
||||
::std::panic!()
|
||||
};
|
||||
($($arg:tt)*) => {{
|
||||
use std::io::Write as _;
|
||||
|
||||
let panic_stream = std::io::stderr();
|
||||
let choice = $crate::AutoStream::choice(&panic_stream);
|
||||
let buffer = Vec::new();
|
||||
let mut stream = $crate::AutoStream::new(buffer, choice);
|
||||
// Ignore errors rather than panic
|
||||
let _ = ::std::write!(&mut stream, $($arg)*);
|
||||
let buffer = stream.into_inner();
|
||||
// Should be UTF-8 but not wanting to panic
|
||||
let buffer = String::from_utf8_lossy(&buffer).into_owned();
|
||||
::std::panic!("{}", buffer)
|
||||
}};
|
||||
}
|
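Taken together, these macros are drop-in replacements for their `std` counterparts that adapt ANSI styling to the target stream. A minimal usage sketch (assuming the `auto` feature is enabled, and using the companion `anstyle` crate, which is not part of this file, to build the escape codes):

```rust
// Sketch only: styles can be emitted unconditionally; the auto stream
// decides whether the escape codes actually reach the output.
use anstyle::{AnsiColor, Style};

fn main() {
    let warn = Style::new().bold().fg_color(Some(AnsiColor::Yellow.into()));
    // `{warn}` renders the style's escape code, `{warn:#}` renders the reset.
    anstream::println!("{warn}warning:{warn:#} primary output goes to stdout");
    anstream::eprintln!("errors and progress go to stderr");
}
```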
261
vendor/anstream/src/stream.rs
vendored
Normal file
@ -0,0 +1,261 @@
//! Higher-level traits to describe writeable streams

/// Required functionality for underlying [`std::io::Write`] for adaptation
#[cfg(not(all(windows, feature = "wincon")))]
pub trait RawStream: std::io::Write + IsTerminal + private::Sealed {}

/// Required functionality for underlying [`std::io::Write`] for adaptation
#[cfg(all(windows, feature = "wincon"))]
pub trait RawStream:
    std::io::Write + IsTerminal + anstyle_wincon::WinconStream + private::Sealed
{
}

impl RawStream for std::io::Stdout {}

impl RawStream for std::io::StdoutLock<'_> {}

impl RawStream for &'_ mut std::io::StdoutLock<'_> {}

impl RawStream for std::io::Stderr {}

impl RawStream for std::io::StderrLock<'_> {}

impl RawStream for &'_ mut std::io::StderrLock<'_> {}

impl RawStream for Box<dyn std::io::Write> {}

impl RawStream for &'_ mut Box<dyn std::io::Write> {}

impl RawStream for Vec<u8> {}

impl RawStream for &'_ mut Vec<u8> {}

impl RawStream for std::fs::File {}

impl RawStream for &'_ mut std::fs::File {}

#[allow(deprecated)]
impl RawStream for crate::Buffer {}

#[allow(deprecated)]
impl RawStream for &'_ mut crate::Buffer {}

pub trait IsTerminal: private::Sealed {
    fn is_terminal(&self) -> bool;
}

impl IsTerminal for std::io::Stdout {
    #[inline]
    fn is_terminal(&self) -> bool {
        std::io::IsTerminal::is_terminal(self)
    }
}

impl IsTerminal for std::io::StdoutLock<'_> {
    #[inline]
    fn is_terminal(&self) -> bool {
        std::io::IsTerminal::is_terminal(self)
    }
}

impl IsTerminal for &'_ mut std::io::StdoutLock<'_> {
    #[inline]
    fn is_terminal(&self) -> bool {
        (**self).is_terminal()
    }
}

impl IsTerminal for std::io::Stderr {
    #[inline]
    fn is_terminal(&self) -> bool {
        std::io::IsTerminal::is_terminal(self)
    }
}

impl IsTerminal for std::io::StderrLock<'_> {
    #[inline]
    fn is_terminal(&self) -> bool {
        std::io::IsTerminal::is_terminal(self)
    }
}

impl IsTerminal for &'_ mut std::io::StderrLock<'_> {
    #[inline]
    fn is_terminal(&self) -> bool {
        (**self).is_terminal()
    }
}

impl IsTerminal for Box<dyn std::io::Write> {
    #[inline]
    fn is_terminal(&self) -> bool {
        false
    }
}

impl IsTerminal for &'_ mut Box<dyn std::io::Write> {
    #[inline]
    fn is_terminal(&self) -> bool {
        false
    }
}

impl IsTerminal for Vec<u8> {
    #[inline]
    fn is_terminal(&self) -> bool {
        false
    }
}

impl IsTerminal for &'_ mut Vec<u8> {
    #[inline]
    fn is_terminal(&self) -> bool {
        false
    }
}

impl IsTerminal for std::fs::File {
    #[inline]
    fn is_terminal(&self) -> bool {
        std::io::IsTerminal::is_terminal(self)
    }
}

impl IsTerminal for &'_ mut std::fs::File {
    #[inline]
    fn is_terminal(&self) -> bool {
        (**self).is_terminal()
    }
}

#[allow(deprecated)]
impl IsTerminal for crate::Buffer {
    #[inline]
    fn is_terminal(&self) -> bool {
        false
    }
}

#[allow(deprecated)]
impl IsTerminal for &'_ mut crate::Buffer {
    #[inline]
    fn is_terminal(&self) -> bool {
        (**self).is_terminal()
    }
}

pub trait AsLockedWrite: private::Sealed {
    type Write<'w>: RawStream + 'w
    where
        Self: 'w;

    fn as_locked_write(&mut self) -> Self::Write<'_>;
}

impl AsLockedWrite for std::io::Stdout {
    type Write<'w> = std::io::StdoutLock<'w>;

    #[inline]
    fn as_locked_write(&mut self) -> Self::Write<'_> {
        self.lock()
    }
}

impl AsLockedWrite for std::io::StdoutLock<'static> {
    type Write<'w> = &'w mut Self;

    #[inline]
    fn as_locked_write(&mut self) -> Self::Write<'_> {
        self
    }
}

impl AsLockedWrite for std::io::Stderr {
    type Write<'w> = std::io::StderrLock<'w>;

    #[inline]
    fn as_locked_write(&mut self) -> Self::Write<'_> {
        self.lock()
    }
}

impl AsLockedWrite for std::io::StderrLock<'static> {
    type Write<'w> = &'w mut Self;

    #[inline]
    fn as_locked_write(&mut self) -> Self::Write<'_> {
        self
    }
}

impl AsLockedWrite for Box<dyn std::io::Write> {
    type Write<'w> = &'w mut Self;

    #[inline]
    fn as_locked_write(&mut self) -> Self::Write<'_> {
        self
    }
}

impl AsLockedWrite for Vec<u8> {
    type Write<'w> = &'w mut Self;

    #[inline]
    fn as_locked_write(&mut self) -> Self::Write<'_> {
        self
    }
}

impl AsLockedWrite for std::fs::File {
    type Write<'w> = &'w mut Self;

    #[inline]
    fn as_locked_write(&mut self) -> Self::Write<'_> {
        self
    }
}

#[allow(deprecated)]
impl AsLockedWrite for crate::Buffer {
    type Write<'w> = &'w mut Self;

    #[inline]
    fn as_locked_write(&mut self) -> Self::Write<'_> {
        self
    }
}

mod private {
    pub trait Sealed {}

    impl Sealed for std::io::Stdout {}

    impl Sealed for std::io::StdoutLock<'_> {}

    impl Sealed for &'_ mut std::io::StdoutLock<'_> {}

    impl Sealed for std::io::Stderr {}

    impl Sealed for std::io::StderrLock<'_> {}

    impl Sealed for &'_ mut std::io::StderrLock<'_> {}

    impl Sealed for Box<dyn std::io::Write> {}

    impl Sealed for &'_ mut Box<dyn std::io::Write> {}

    impl Sealed for Vec<u8> {}

    impl Sealed for &'_ mut Vec<u8> {}

    impl Sealed for std::fs::File {}

    impl Sealed for &'_ mut std::fs::File {}

    #[allow(deprecated)]
    impl Sealed for crate::Buffer {}

    #[allow(deprecated)]
    impl Sealed for &'_ mut crate::Buffer {}
}
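A small sketch of what these sealed traits buy downstream code, assuming the module is exposed as `anstream::stream` (the re-export itself is not shown in this diff): any `RawStream` is both a `std::io::Write` and an `IsTerminal`, so one generic bound covers terminal detection and writing.

```rust
// Sketch: one bound gives both writing and terminal detection.
use anstream::stream::{IsTerminal as _, RawStream};
use std::io::Write as _;

fn emit(mut out: impl RawStream, msg: &str) -> std::io::Result<()> {
    if out.is_terminal() {
        writeln!(out, "\x1b[1m{msg}\x1b[0m") // style only when on a TTY
    } else {
        writeln!(out, "{msg}")
    }
}

fn main() -> std::io::Result<()> {
    emit(std::io::stdout(), "to the terminal or a pipe")?;
    emit(Vec::<u8>::new(), "or to an in-memory buffer")
}
```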
226
vendor/anstream/src/strip.rs
vendored
Normal file
@ -0,0 +1,226 @@
use crate::adapter::StripBytes;
use crate::stream::AsLockedWrite;
use crate::stream::IsTerminal;

/// Only pass printable data to the inner `Write`
#[derive(Debug)]
pub struct StripStream<S>
where
    S: std::io::Write,
{
    raw: S,
    state: StripBytes,
}

impl<S> StripStream<S>
where
    S: std::io::Write,
{
    /// Only pass printable data to the inner `Write`
    #[inline]
    pub fn new(raw: S) -> Self {
        Self {
            raw,
            state: Default::default(),
        }
    }

    /// Get the wrapped [`std::io::Write`]
    #[inline]
    pub fn into_inner(self) -> S {
        self.raw
    }
}

impl<S> StripStream<S>
where
    S: std::io::Write,
    S: IsTerminal,
{
    #[inline]
    pub fn is_terminal(&self) -> bool {
        self.raw.is_terminal()
    }
}

impl StripStream<std::io::Stdout> {
    /// Get exclusive access to the `StripStream`
    ///
    /// Why?
    /// - Faster performance when writing in a loop
    /// - Avoid other threads interleaving output with the current thread
    #[inline]
    pub fn lock(self) -> StripStream<std::io::StdoutLock<'static>> {
        StripStream {
            raw: self.raw.lock(),
            state: self.state,
        }
    }
}

impl StripStream<std::io::Stderr> {
    /// Get exclusive access to the `StripStream`
    ///
    /// Why?
    /// - Faster performance when writing in a loop
    /// - Avoid other threads interleaving output with the current thread
    #[inline]
    pub fn lock(self) -> StripStream<std::io::StderrLock<'static>> {
        StripStream {
            raw: self.raw.lock(),
            state: self.state,
        }
    }
}

impl<S> std::io::Write for StripStream<S>
where
    S: std::io::Write,
    S: AsLockedWrite,
{
    // Must forward all calls to ensure locking happens appropriately
    #[inline]
    fn write(&mut self, buf: &[u8]) -> std::io::Result<usize> {
        write(&mut self.raw.as_locked_write(), &mut self.state, buf)
    }
    #[inline]
    fn write_vectored(&mut self, bufs: &[std::io::IoSlice<'_>]) -> std::io::Result<usize> {
        let buf = bufs
            .iter()
            .find(|b| !b.is_empty())
            .map(|b| &**b)
            .unwrap_or(&[][..]);
        self.write(buf)
    }
    // is_write_vectored: nightly only
    #[inline]
    fn flush(&mut self) -> std::io::Result<()> {
        self.raw.as_locked_write().flush()
    }
    #[inline]
    fn write_all(&mut self, buf: &[u8]) -> std::io::Result<()> {
        write_all(&mut self.raw.as_locked_write(), &mut self.state, buf)
    }
    // write_all_vectored: nightly only
    #[inline]
    fn write_fmt(&mut self, args: std::fmt::Arguments<'_>) -> std::io::Result<()> {
        write_fmt(&mut self.raw.as_locked_write(), &mut self.state, args)
    }
}

fn write(
    raw: &mut dyn std::io::Write,
    state: &mut StripBytes,
    buf: &[u8],
) -> std::io::Result<usize> {
    let initial_state = state.clone();

    for printable in state.strip_next(buf) {
        let possible = printable.len();
        let written = raw.write(printable)?;
        if possible != written {
            let divergence = &printable[written..];
            let offset = offset_to(buf, divergence);
            let consumed = &buf[offset..];
            *state = initial_state;
            state.strip_next(consumed).last();
            return Ok(offset);
        }
    }
    Ok(buf.len())
}

fn write_all(
    raw: &mut dyn std::io::Write,
    state: &mut StripBytes,
    buf: &[u8],
) -> std::io::Result<()> {
    for printable in state.strip_next(buf) {
        raw.write_all(printable)?;
    }
    Ok(())
}

fn write_fmt(
    raw: &mut dyn std::io::Write,
    state: &mut StripBytes,
    args: std::fmt::Arguments<'_>,
) -> std::io::Result<()> {
    let write_all = |buf: &[u8]| write_all(raw, state, buf);
    crate::fmt::Adapter::new(write_all).write_fmt(args)
}

#[inline]
fn offset_to(total: &[u8], subslice: &[u8]) -> usize {
    let total = total.as_ptr();
    let subslice = subslice.as_ptr();

    debug_assert!(
        total <= subslice,
        "`Offset::offset_to` only accepts slices of `self`"
    );
    subslice as usize - total as usize
}

#[cfg(test)]
mod test {
    use super::*;
    use proptest::prelude::*;
    use std::io::Write as _;

    proptest! {
        #[test]
        #[cfg_attr(miri, ignore)] // See https://github.com/AltSysrq/proptest/issues/253
        fn write_all_no_escapes(s in "\\PC*") {
            let buffer = Vec::new();
            let mut stream = StripStream::new(buffer);
            stream.write_all(s.as_bytes()).unwrap();
            let buffer = stream.into_inner();
            let actual = std::str::from_utf8(buffer.as_ref()).unwrap();
            assert_eq!(s, actual);
        }

        #[test]
        #[cfg_attr(miri, ignore)] // See https://github.com/AltSysrq/proptest/issues/253
        fn write_byte_no_escapes(s in "\\PC*") {
            let buffer = Vec::new();
            let mut stream = StripStream::new(buffer);
            for byte in s.as_bytes() {
                stream.write_all(&[*byte]).unwrap();
            }
            let buffer = stream.into_inner();
            let actual = std::str::from_utf8(buffer.as_ref()).unwrap();
            assert_eq!(s, actual);
        }

        #[test]
        #[cfg_attr(miri, ignore)] // See https://github.com/AltSysrq/proptest/issues/253
        fn write_all_random(s in any::<Vec<u8>>()) {
            let buffer = Vec::new();
            let mut stream = StripStream::new(buffer);
            stream.write_all(s.as_slice()).unwrap();
            let buffer = stream.into_inner();
            if let Ok(actual) = std::str::from_utf8(buffer.as_ref()) {
                for char in actual.chars() {
                    assert!(!char.is_ascii() || !char.is_control() || char.is_ascii_whitespace(), "{:?} -> {:?}: {:?}", String::from_utf8_lossy(&s), actual, char);
                }
            }
        }

        #[test]
        #[cfg_attr(miri, ignore)] // See https://github.com/AltSysrq/proptest/issues/253
        fn write_byte_random(s in any::<Vec<u8>>()) {
            let buffer = Vec::new();
            let mut stream = StripStream::new(buffer);
            for byte in s.as_slice() {
                stream.write_all(&[*byte]).unwrap();
            }
            let buffer = stream.into_inner();
            if let Ok(actual) = std::str::from_utf8(buffer.as_ref()) {
                for char in actual.chars() {
                    assert!(!char.is_ascii() || !char.is_control() || char.is_ascii_whitespace(), "{:?} -> {:?}: {:?}", String::from_utf8_lossy(&s), actual, char);
                }
            }
        }
    }
}
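As a usage sketch, the `Vec<u8>` support shown in the tests above makes it easy to strip escape codes from already-captured bytes (assuming the crate-root re-export `anstream::StripStream`):

```rust
// Sketch: `StripStream` filters escape sequences out of whatever it wraps;
// wrapping a `Vec<u8>` lets the plain text be inspected afterwards.
use std::io::Write as _;

fn main() {
    let mut stream = anstream::StripStream::new(Vec::new());
    stream
        .write_all(b"\x1b[1;31merror:\x1b[0m disk full")
        .expect("writing to a Vec cannot fail");
    assert_eq!(stream.into_inner(), b"error: disk full".to_vec());
}
```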
225
vendor/anstream/src/wincon.rs
vendored
Normal file
@ -0,0 +1,225 @@
use crate::adapter::WinconBytes;
use crate::stream::AsLockedWrite;
use crate::stream::IsTerminal;

/// Only pass printable data to the inner `Write`
#[cfg(feature = "wincon")] // here mostly for documentation purposes
#[derive(Debug)]
pub struct WinconStream<S>
where
    S: anstyle_wincon::WinconStream,
{
    raw: S,
    // `WinconBytes` is especially large compared to other variants of `AutoStream`, so boxing it
    // here so `AutoStream` doesn't have to discard one allocation and create another one when
    // calling `AutoStream::lock`
    state: Box<WinconBytes>,
}

impl<S> WinconStream<S>
where
    S: anstyle_wincon::WinconStream,
{
    /// Only pass printable data to the inner `Write`
    #[inline]
    pub fn new(raw: S) -> Self {
        Self {
            raw,
            state: Default::default(),
        }
    }

    /// Get the wrapped [`anstyle_wincon::WinconStream`]
    #[inline]
    pub fn into_inner(self) -> S {
        self.raw
    }
}

impl<S> WinconStream<S>
where
    S: anstyle_wincon::WinconStream,
    S: IsTerminal,
{
    #[inline]
    pub fn is_terminal(&self) -> bool {
        self.raw.is_terminal()
    }
}

impl WinconStream<std::io::Stdout> {
    /// Get exclusive access to the `WinconStream`
    ///
    /// Why?
    /// - Faster performance when writing in a loop
    /// - Avoid other threads interleaving output with the current thread
    #[inline]
    pub fn lock(self) -> WinconStream<std::io::StdoutLock<'static>> {
        WinconStream {
            raw: self.raw.lock(),
            state: self.state,
        }
    }
}

impl WinconStream<std::io::Stderr> {
    /// Get exclusive access to the `WinconStream`
    ///
    /// Why?
    /// - Faster performance when writing in a loop
    /// - Avoid other threads interleaving output with the current thread
    #[inline]
    pub fn lock(self) -> WinconStream<std::io::StderrLock<'static>> {
        WinconStream {
            raw: self.raw.lock(),
            state: self.state,
        }
    }
}

impl<S> std::io::Write for WinconStream<S>
where
    S: anstyle_wincon::WinconStream,
    S: AsLockedWrite,
{
    // Must forward all calls to ensure locking happens appropriately
    #[inline]
    fn write(&mut self, buf: &[u8]) -> std::io::Result<usize> {
        write(&mut self.raw.as_locked_write(), &mut self.state, buf)
    }
    #[inline]
    fn write_vectored(&mut self, bufs: &[std::io::IoSlice<'_>]) -> std::io::Result<usize> {
        let buf = bufs
            .iter()
            .find(|b| !b.is_empty())
            .map(|b| &**b)
            .unwrap_or(&[][..]);
        self.write(buf)
    }
    // is_write_vectored: nightly only
    #[inline]
    fn flush(&mut self) -> std::io::Result<()> {
        self.raw.as_locked_write().flush()
    }
    #[inline]
    fn write_all(&mut self, buf: &[u8]) -> std::io::Result<()> {
        write_all(&mut self.raw.as_locked_write(), &mut self.state, buf)
    }
    // write_all_vectored: nightly only
    #[inline]
    fn write_fmt(&mut self, args: std::fmt::Arguments<'_>) -> std::io::Result<()> {
        write_fmt(&mut self.raw.as_locked_write(), &mut self.state, args)
    }
}

fn write(
    raw: &mut dyn anstyle_wincon::WinconStream,
    state: &mut WinconBytes,
    buf: &[u8],
) -> std::io::Result<usize> {
    for (style, printable) in state.extract_next(buf) {
        let fg = style.get_fg_color().and_then(cap_wincon_color);
        let bg = style.get_bg_color().and_then(cap_wincon_color);
        let written = raw.write_colored(fg, bg, printable.as_bytes())?;
        let possible = printable.len();
        if possible != written {
            // HACK: Unsupported atm
            break;
        }
    }
    Ok(buf.len())
}

fn write_all(
    raw: &mut dyn anstyle_wincon::WinconStream,
    state: &mut WinconBytes,
    buf: &[u8],
) -> std::io::Result<()> {
    for (style, printable) in state.extract_next(buf) {
        let mut buf = printable.as_bytes();
        let fg = style.get_fg_color().and_then(cap_wincon_color);
        let bg = style.get_bg_color().and_then(cap_wincon_color);
        while !buf.is_empty() {
            match raw.write_colored(fg, bg, buf) {
                Ok(0) => {
                    return Err(std::io::Error::new(
                        std::io::ErrorKind::WriteZero,
                        "failed to write whole buffer",
                    ));
                }
                Ok(n) => buf = &buf[n..],
                Err(ref e) if e.kind() == std::io::ErrorKind::Interrupted => {}
                Err(e) => return Err(e),
            }
        }
    }
    Ok(())
}

fn write_fmt(
    raw: &mut dyn anstyle_wincon::WinconStream,
    state: &mut WinconBytes,
    args: std::fmt::Arguments<'_>,
) -> std::io::Result<()> {
    let write_all = |buf: &[u8]| write_all(raw, state, buf);
    crate::fmt::Adapter::new(write_all).write_fmt(args)
}

fn cap_wincon_color(color: anstyle::Color) -> Option<anstyle::AnsiColor> {
    match color {
        anstyle::Color::Ansi(c) => Some(c),
        anstyle::Color::Ansi256(c) => c.into_ansi(),
        anstyle::Color::Rgb(_) => None,
    }
}

#[cfg(test)]
mod test {
    use super::*;
    use proptest::prelude::*;
    use std::io::Write as _;

    proptest! {
        #[test]
        #[cfg_attr(miri, ignore)] // See https://github.com/AltSysrq/proptest/issues/253
        fn write_all_no_escapes(s in "\\PC*") {
            let buffer = Vec::new();
            let mut stream = WinconStream::new(buffer);
            stream.write_all(s.as_bytes()).unwrap();
            let buffer = stream.into_inner();
            let actual = std::str::from_utf8(buffer.as_ref()).unwrap();
            assert_eq!(s, actual);
        }

        #[test]
        #[cfg_attr(miri, ignore)] // See https://github.com/AltSysrq/proptest/issues/253
        fn write_byte_no_escapes(s in "\\PC*") {
            let buffer = Vec::new();
            let mut stream = WinconStream::new(buffer);
            for byte in s.as_bytes() {
                stream.write_all(&[*byte]).unwrap();
            }
            let buffer = stream.into_inner();
            let actual = std::str::from_utf8(buffer.as_ref()).unwrap();
            assert_eq!(s, actual);
        }

        #[test]
        #[cfg_attr(miri, ignore)] // See https://github.com/AltSysrq/proptest/issues/253
        fn write_all_random(s in any::<Vec<u8>>()) {
            let buffer = Vec::new();
            let mut stream = WinconStream::new(buffer);
            stream.write_all(s.as_slice()).unwrap();
        }

        #[test]
        #[cfg_attr(miri, ignore)] // See https://github.com/AltSysrq/proptest/issues/253
        fn write_byte_random(s in any::<Vec<u8>>()) {
            let buffer = Vec::new();
            let mut stream = WinconStream::new(buffer);
            for byte in s.as_slice() {
                stream.write_all(&[*byte]).unwrap();
            }
        }
    }
}
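The capping rule at the bottom deserves a note: legacy wincon attributes only cover the 16 ANSI colors, so 256-color values are downgraded when an ANSI equivalent exists and RGB values are dropped. A small sketch against the `anstyle` color types used above:

```rust
// Sketch of the same mapping as `cap_wincon_color` above.
fn cap(color: anstyle::Color) -> Option<anstyle::AnsiColor> {
    match color {
        anstyle::Color::Ansi(c) => Some(c),
        anstyle::Color::Ansi256(c) => c.into_ansi(), // Some only for indices 0..=15
        anstyle::Color::Rgb(_) => None,
    }
}

fn main() {
    // Index 9 is the 256-color alias of bright red, so it survives the cap.
    assert_eq!(
        cap(anstyle::Color::Ansi256(anstyle::Ansi256Color(9))),
        Some(anstyle::AnsiColor::BrightRed)
    );
    // True-color values have no wincon equivalent.
    assert_eq!(cap(anstyle::Color::Rgb(anstyle::RgbColor(200, 0, 0))), None);
}
```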
1
vendor/anstyle-parse/.cargo-checksum.json
vendored
Normal file
@ -0,0 +1 @@
{"files":{"Cargo.lock":"7f68b5328c460caf1d2198b10fe1761e5f0282262f92d04076b30b25539970b0","Cargo.toml":"2834f39b7169c03b03da1e209f56133783ce00ea64d5f2c14381d93984ca20bf","LICENSE-APACHE":"b40930bbcf80744c86c46a12bc9da056641d722716c378f5659b9e555ef833e1","LICENSE-MIT":"c1d4bc00896473e0109ccb4c3c7d21addb55a4ff1a644be204dcfce26612af2a","README.md":"abc82171d436ee0eb221838e8d21a21a2e392504e87f0c130b5eca6a35671e1e","benches/parse.rs":"336c808d51c90db2497fa87e571df7f71c844a1b09be88839fe4255066c632f4","examples/parselog.rs":"58b7db739deed701aa0ab386d0d0c1772511b8aed1c08d31ec5b35a1c8cd4321","src/lib.rs":"c89f2afa0e982276dc47ca8d8a76d47516aa39aa9d3354254c87fdbf2f8ef4cc","src/params.rs":"8cfef4e2ab1961ca2d9f210da553fc6ac64bb6dbd03321f0ee7d6089ab45389c","src/state/codegen.rs":"8530124c8f998f391e47950f130590376321dcade810990f4312c3b1c0a61968","src/state/definitions.rs":"dc3dbb3244def74430a72b0108f019e22cc02e0ae5f563ee14d38300ff82b814","src/state/mod.rs":"be07c2ea393a971dd54117dc2ce8a3ffb5b803cb557ab468389b74570855fa37","src/state/table.rs":"673b7e9242c5248efc076086cc6923578ec2f059c0c26da21363528e20e4285c"},"package":"c75ac65da39e5fe5ab759307499ddad880d724eed2f6ce5b5e8a26f4f387928c"}
1202
vendor/anstyle-parse/Cargo.lock
generated
vendored
Normal file
File diff suppressed because it is too large
108
vendor/anstyle-parse/Cargo.toml
vendored
Normal file
@ -0,0 +1,108 @@
# THIS FILE IS AUTOMATICALLY GENERATED BY CARGO
#
# When uploading crates to the registry Cargo will automatically
# "normalize" Cargo.toml files for maximal compatibility
# with all versions of Cargo and also rewrite `path` dependencies
# to registry (e.g., crates.io) dependencies.
#
# If you are reading this file be aware that the original Cargo.toml
# will likely look very different (and much more reasonable).
# See Cargo.toml.orig for the original contents.

[package]
edition = "2021"
rust-version = "1.70.0"
name = "anstyle-parse"
version = "0.2.3"
include = [
    "build.rs",
    "src/**/*",
    "Cargo.toml",
    "Cargo.lock",
    "LICENSE*",
    "README.md",
    "benches/**/*",
    "examples/**/*",
]
description = "Parse ANSI Style Escapes"
homepage = "https://github.com/rust-cli/anstyle"
readme = "README.md"
keywords = [
    "ansi",
    "terminal",
    "color",
    "vte",
]
categories = ["command-line-interface"]
license = "MIT OR Apache-2.0"
repository = "https://github.com/rust-cli/anstyle.git"

[[package.metadata.release.pre-release-replacements]]
file = "CHANGELOG.md"
min = 1
replace = "{{version}}"
search = "Unreleased"

[[package.metadata.release.pre-release-replacements]]
exactly = 1
file = "CHANGELOG.md"
replace = "...{{tag_name}}"
search = '\.\.\.HEAD'

[[package.metadata.release.pre-release-replacements]]
file = "CHANGELOG.md"
min = 1
replace = "{{date}}"
search = "ReleaseDate"

[[package.metadata.release.pre-release-replacements]]
exactly = 1
file = "CHANGELOG.md"
replace = """
<!-- next-header -->
## [Unreleased] - ReleaseDate
"""
search = "<!-- next-header -->"

[[package.metadata.release.pre-release-replacements]]
exactly = 1
file = "CHANGELOG.md"
replace = """
<!-- next-url -->
[Unreleased]: https://github.com/rust-cli/anstyle/compare/{{tag_name}}...HEAD"""
search = "<!-- next-url -->"

[[bench]]
name = "parse"
harness = false

[dependencies.arrayvec]
version = "0.7.2"
optional = true
default-features = false

[dependencies.utf8parse]
version = "0.2.1"
optional = true

[dev-dependencies.codegenrs]
version = "3.0.1"
default-features = false

[dev-dependencies.criterion]
version = "0.5.1"

[dev-dependencies.proptest]
version = "1.4.0"

[dev-dependencies.snapbox]
version = "0.4.14"
features = ["path"]

[dev-dependencies.vte_generate_state_changes]
version = "0.1.1"

[features]
core = ["dep:arrayvec"]
default = ["utf8"]
utf8 = ["dep:utf8parse"]
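Per the `[features]` table above, the default `utf8` feature pulls in `utf8parse` for `char`-level dispatch, while the optional `core` feature pulls in `arrayvec` instead. A minimal sketch of default-feature usage (type names taken from the crate's own benches and examples below):

```rust
// Sketch: with the default `utf8` feature, `DefaultCharAccumulator` decodes
// UTF-8 so `Perform::print` receives whole `char`s rather than raw bytes.
use anstyle_parse::{DefaultCharAccumulator, Parser};

fn main() {
    let parser = Parser::<DefaultCharAccumulator>::new();
    // Bytes are fed one at a time via `parser.advance(&mut performer, byte)`.
    let _ = parser;
}
```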
201
vendor/anstyle-parse/LICENSE-APACHE
vendored
Normal file
@ -0,0 +1,201 @@
                                 Apache License
                           Version 2.0, January 2004
                        http://www.apache.org/licenses/

   TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION

   1. Definitions.

      "License" shall mean the terms and conditions for use, reproduction,
      and distribution as defined by Sections 1 through 9 of this document.

      "Licensor" shall mean the copyright owner or entity authorized by
      the copyright owner that is granting the License.

      "Legal Entity" shall mean the union of the acting entity and all
      other entities that control, are controlled by, or are under common
      control with that entity. For the purposes of this definition,
      "control" means (i) the power, direct or indirect, to cause the
      direction or management of such entity, whether by contract or
      otherwise, or (ii) ownership of fifty percent (50%) or more of the
      outstanding shares, or (iii) beneficial ownership of such entity.

      "You" (or "Your") shall mean an individual or Legal Entity
      exercising permissions granted by this License.

      "Source" form shall mean the preferred form for making modifications,
      including but not limited to software source code, documentation
      source, and configuration files.

      "Object" form shall mean any form resulting from mechanical
      transformation or translation of a Source form, including but
      not limited to compiled object code, generated documentation,
      and conversions to other media types.

      "Work" shall mean the work of authorship, whether in Source or
      Object form, made available under the License, as indicated by a
      copyright notice that is included in or attached to the work
      (an example is provided in the Appendix below).

      "Derivative Works" shall mean any work, whether in Source or Object
      form, that is based on (or derived from) the Work and for which the
      editorial revisions, annotations, elaborations, or other modifications
      represent, as a whole, an original work of authorship. For the purposes
      of this License, Derivative Works shall not include works that remain
      separable from, or merely link (or bind by name) to the interfaces of,
      the Work and Derivative Works thereof.

      "Contribution" shall mean any work of authorship, including
      the original version of the Work and any modifications or additions
      to that Work or Derivative Works thereof, that is intentionally
      submitted to Licensor for inclusion in the Work by the copyright owner
      or by an individual or Legal Entity authorized to submit on behalf of
      the copyright owner. For the purposes of this definition, "submitted"
      means any form of electronic, verbal, or written communication sent
      to the Licensor or its representatives, including but not limited to
      communication on electronic mailing lists, source code control systems,
      and issue tracking systems that are managed by, or on behalf of, the
      Licensor for the purpose of discussing and improving the Work, but
      excluding communication that is conspicuously marked or otherwise
      designated in writing by the copyright owner as "Not a Contribution."

      "Contributor" shall mean Licensor and any individual or Legal Entity
      on behalf of whom a Contribution has been received by Licensor and
      subsequently incorporated within the Work.

   2. Grant of Copyright License. Subject to the terms and conditions of
      this License, each Contributor hereby grants to You a perpetual,
      worldwide, non-exclusive, no-charge, royalty-free, irrevocable
      copyright license to reproduce, prepare Derivative Works of,
      publicly display, publicly perform, sublicense, and distribute the
      Work and such Derivative Works in Source or Object form.

   3. Grant of Patent License. Subject to the terms and conditions of
      this License, each Contributor hereby grants to You a perpetual,
      worldwide, non-exclusive, no-charge, royalty-free, irrevocable
      (except as stated in this section) patent license to make, have made,
      use, offer to sell, sell, import, and otherwise transfer the Work,
      where such license applies only to those patent claims licensable
      by such Contributor that are necessarily infringed by their
      Contribution(s) alone or by combination of their Contribution(s)
      with the Work to which such Contribution(s) was submitted. If You
      institute patent litigation against any entity (including a
      cross-claim or counterclaim in a lawsuit) alleging that the Work
      or a Contribution incorporated within the Work constitutes direct
      or contributory patent infringement, then any patent licenses
      granted to You under this License for that Work shall terminate
      as of the date such litigation is filed.

   4. Redistribution. You may reproduce and distribute copies of the
      Work or Derivative Works thereof in any medium, with or without
      modifications, and in Source or Object form, provided that You
      meet the following conditions:

      (a) You must give any other recipients of the Work or
          Derivative Works a copy of this License; and

      (b) You must cause any modified files to carry prominent notices
          stating that You changed the files; and

      (c) You must retain, in the Source form of any Derivative Works
          that You distribute, all copyright, patent, trademark, and
          attribution notices from the Source form of the Work,
          excluding those notices that do not pertain to any part of
          the Derivative Works; and

      (d) If the Work includes a "NOTICE" text file as part of its
          distribution, then any Derivative Works that You distribute must
          include a readable copy of the attribution notices contained
          within such NOTICE file, excluding those notices that do not
          pertain to any part of the Derivative Works, in at least one
          of the following places: within a NOTICE text file distributed
          as part of the Derivative Works; within the Source form or
          documentation, if provided along with the Derivative Works; or,
          within a display generated by the Derivative Works, if and
          wherever such third-party notices normally appear. The contents
          of the NOTICE file are for informational purposes only and
          do not modify the License. You may add Your own attribution
          notices within Derivative Works that You distribute, alongside
          or as an addendum to the NOTICE text from the Work, provided
          that such additional attribution notices cannot be construed
          as modifying the License.

      You may add Your own copyright statement to Your modifications and
      may provide additional or different license terms and conditions
      for use, reproduction, or distribution of Your modifications, or
      for any such Derivative Works as a whole, provided Your use,
      reproduction, and distribution of the Work otherwise complies with
      the conditions stated in this License.

   5. Submission of Contributions. Unless You explicitly state otherwise,
      any Contribution intentionally submitted for inclusion in the Work
      by You to the Licensor shall be under the terms and conditions of
      this License, without any additional terms or conditions.
      Notwithstanding the above, nothing herein shall supersede or modify
      the terms of any separate license agreement you may have executed
      with Licensor regarding such Contributions.

   6. Trademarks. This License does not grant permission to use the trade
      names, trademarks, service marks, or product names of the Licensor,
      except as required for reasonable and customary use in describing the
      origin of the Work and reproducing the content of the NOTICE file.

   7. Disclaimer of Warranty. Unless required by applicable law or
      agreed to in writing, Licensor provides the Work (and each
      Contributor provides its Contributions) on an "AS IS" BASIS,
      WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
      implied, including, without limitation, any warranties or conditions
      of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A
      PARTICULAR PURPOSE. You are solely responsible for determining the
      appropriateness of using or redistributing the Work and assume any
      risks associated with Your exercise of permissions under this License.

   8. Limitation of Liability. In no event and under no legal theory,
      whether in tort (including negligence), contract, or otherwise,
      unless required by applicable law (such as deliberate and grossly
      negligent acts) or agreed to in writing, shall any Contributor be
      liable to You for damages, including any direct, indirect, special,
      incidental, or consequential damages of any character arising as a
      result of this License or out of the use or inability to use the
      Work (including but not limited to damages for loss of goodwill,
      work stoppage, computer failure or malfunction, or any and all
      other commercial damages or losses), even if such Contributor
      has been advised of the possibility of such damages.

   9. Accepting Warranty or Additional Liability. While redistributing
      the Work or Derivative Works thereof, You may choose to offer,
      and charge a fee for, acceptance of support, warranty, indemnity,
      or other liability obligations and/or rights consistent with this
      License. However, in accepting such obligations, You may act only
      on Your own behalf and on Your sole responsibility, not on behalf
      of any other Contributor, and only if You agree to indemnify,
      defend, and hold each Contributor harmless for any liability
      incurred by, or claims asserted against, such Contributor by reason
      of your accepting any such warranty or additional liability.

   END OF TERMS AND CONDITIONS

   APPENDIX: How to apply the Apache License to your work.

      To apply the Apache License to your work, attach the following
      boilerplate notice, with the fields enclosed by brackets "{}"
      replaced with your own identifying information. (Don't include
      the brackets!) The text should be enclosed in the appropriate
      comment syntax for the file format. We also recommend that a
      file or class name and description of purpose be included on the
      same "printed page" as the copyright notice for easier
      identification within third-party archives.

   Copyright {yyyy} {name of copyright owner}

   Licensed under the Apache License, Version 2.0 (the "License");
   you may not use this file except in compliance with the License.
   You may obtain a copy of the License at

       http://www.apache.org/licenses/LICENSE-2.0

   Unless required by applicable law or agreed to in writing, software
   distributed under the License is distributed on an "AS IS" BASIS,
   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
   See the License for the specific language governing permissions and
   limitations under the License.
25
vendor/anstyle-parse/LICENSE-MIT
vendored
Normal file
@ -0,0 +1,25 @@
Copyright (c) 2016 Joe Wilm and individual contributors

Permission is hereby granted, free of charge, to any
person obtaining a copy of this software and associated
documentation files (the "Software"), to deal in the
Software without restriction, including without
limitation the rights to use, copy, modify, merge,
publish, distribute, sublicense, and/or sell copies of
the Software, and to permit persons to whom the Software
is furnished to do so, subject to the following
conditions:

The above copyright notice and this permission notice
shall be included in all copies or substantial portions
of the Software.

THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF
ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED
TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A
PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT
SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR
IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
DEALINGS IN THE SOFTWARE.
33
vendor/anstyle-parse/README.md
vendored
Normal file
@ -0,0 +1,33 @@
# anstyle-parse

> Parse [ANSI Style Escapes](https://vt100.net/emu/dec_ansi_parser)

[Documentation] | [Crates.io]

## License

Licensed under either of

* Apache License, Version 2.0, ([LICENSE-APACHE](LICENSE-APACHE) or http://www.apache.org/licenses/LICENSE-2.0)
* MIT license ([LICENSE-MIT](LICENSE-MIT) or http://opensource.org/licenses/MIT)

at your option.

### Contribution

Unless you explicitly state otherwise, any contribution intentionally
submitted for inclusion in the work by you, as defined in the Apache-2.0
license, shall be dual licensed as above, without any additional terms or
conditions.

### Special Thanks

[chrisduerr](https://github.com/alacritty/vte/commits?author=chrisduerr) and the
[alacritty project](https://github.com/alacritty/alacritty) for
[vte](https://crates.io/crates/vte), which
[this was forked from](https://github.com/alacritty/vte/issues/82).

[Crates.io]: https://crates.io/crates/anstyle-parse
[Documentation]: https://docs.rs/anstyle-parse
169
vendor/anstyle-parse/benches/parse.rs
vendored
Normal file
@ -0,0 +1,169 @@
use criterion::{black_box, Criterion};

use anstyle_parse::*;

struct BenchDispatcher;
impl Perform for BenchDispatcher {
    fn print(&mut self, c: char) {
        black_box(c);
    }

    fn execute(&mut self, byte: u8) {
        black_box(byte);
    }

    fn hook(&mut self, params: &Params, intermediates: &[u8], ignore: bool, c: u8) {
        black_box((params, intermediates, ignore, c));
    }

    fn put(&mut self, byte: u8) {
        black_box(byte);
    }

    fn osc_dispatch(&mut self, params: &[&[u8]], bell_terminated: bool) {
        black_box((params, bell_terminated));
    }

    fn csi_dispatch(&mut self, params: &Params, intermediates: &[u8], ignore: bool, c: u8) {
        black_box((params, intermediates, ignore, c));
    }

    fn esc_dispatch(&mut self, intermediates: &[u8], ignore: bool, byte: u8) {
        black_box((intermediates, ignore, byte));
    }
}

#[derive(Default)]
struct Strip(String);
impl Strip {
    fn with_capacity(capacity: usize) -> Self {
        Self(String::with_capacity(capacity))
    }
}
impl Perform for Strip {
    fn print(&mut self, c: char) {
        self.0.push(c);
    }

    fn execute(&mut self, byte: u8) {
        if byte.is_ascii_whitespace() {
            self.0.push(byte as char);
        }
    }
}

fn strip_str(content: &str) -> String {
    use anstyle_parse::state::state_change;
    use anstyle_parse::state::Action;
    use anstyle_parse::state::State;

    #[inline]
    fn is_utf8_continuation(b: u8) -> bool {
        matches!(b, 0x80..=0xbf)
    }

    #[inline]
    fn is_printable(action: Action, byte: u8) -> bool {
        action == Action::Print
            || action == Action::BeginUtf8
            // since we know the input is valid UTF-8, the only thing we can do with
            // continuations is to print them
            || is_utf8_continuation(byte)
            || (action == Action::Execute && byte.is_ascii_whitespace())
    }

    let mut stripped = Vec::with_capacity(content.len());

    let mut bytes = content.as_bytes();
    while !bytes.is_empty() {
        let offset = bytes.iter().copied().position(|b| {
            let (_next_state, action) = state_change(State::Ground, b);
            !is_printable(action, b)
        });
        let (printable, next) = bytes.split_at(offset.unwrap_or(bytes.len()));
        stripped.extend(printable);
        bytes = next;

        let mut state = State::Ground;
        let offset = bytes.iter().copied().position(|b| {
            let (next_state, action) = state_change(state, b);
            if next_state != State::Anywhere {
                state = next_state;
            }
            is_printable(action, b)
        });
        let (_, next) = bytes.split_at(offset.unwrap_or(bytes.len()));
        bytes = next;
    }

    String::from_utf8(stripped).unwrap()
}

fn parse(c: &mut Criterion) {
    for (name, content) in [
        #[cfg(feature = "utf8")]
        ("demo.vte", &include_bytes!("../tests/demo.vte")[..]),
        ("rg_help.vte", &include_bytes!("../tests/rg_help.vte")[..]),
        ("rg_linus.vte", &include_bytes!("../tests/rg_linus.vte")[..]),
        (
            "state_changes",
            &b"\x1b]2;X\x1b\\ \x1b[0m \x1bP0@\x1b\\"[..],
        ),
    ] {
        // Make sure the comparison is fair
        if let Ok(content) = std::str::from_utf8(content) {
            let mut stripped = Strip::with_capacity(content.len());
            let mut parser = Parser::<DefaultCharAccumulator>::new();
            for byte in content.as_bytes() {
                parser.advance(&mut stripped, *byte);
            }
            assert_eq!(stripped.0, strip_str(content));
        }

        let mut group = c.benchmark_group(name);
        group.bench_function("advance", |b| {
            b.iter(|| {
                let mut dispatcher = BenchDispatcher;
                let mut parser = Parser::<DefaultCharAccumulator>::new();

                for byte in content {
                    parser.advance(&mut dispatcher, *byte);
                }
            })
        });
        group.bench_function("advance_strip", |b| {
            b.iter(|| {
                let mut stripped = Strip::with_capacity(content.len());
                let mut parser = Parser::<DefaultCharAccumulator>::new();

                for byte in content {
                    parser.advance(&mut stripped, *byte);
                }

                black_box(stripped.0)
            })
        });
        group.bench_function("state_change", |b| {
            b.iter(|| {
                let mut state = anstyle_parse::state::State::default();
                for byte in content {
                    let (next_state, action) = anstyle_parse::state::state_change(state, *byte);
                    state = next_state;
                    black_box(action);
                }
            })
        });
        if let Ok(content) = std::str::from_utf8(content) {
            group.bench_function("state_change_strip_str", |b| {
                b.iter(|| {
                    let stripped = strip_str(content);

                    black_box(stripped)
                })
            });
        }
    }
}

criterion::criterion_group!(benches, parse);
criterion::criterion_main!(benches);
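The `strip_str` helper above drives the state table directly rather than going through `Parser`; a minimal sketch of that low-level API, grounded in the calls used above:

```rust
// Sketch: `state_change` is a pure lookup from (state, byte) to the next
// state and the action to take, as exercised by `strip_str` above.
use anstyle_parse::state::{state_change, Action, State};

fn main() {
    // A printable byte in the Ground state should simply be printed.
    let (_next, action) = state_change(State::Ground, b'A');
    assert!(matches!(action, Action::Print));
}
```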
78
vendor/anstyle-parse/examples/parselog.rs
vendored
Normal file
@ -0,0 +1,78 @@
//! Parse input from stdin and log actions on stdout
use std::io::{self, Read};

use anstyle_parse::{DefaultCharAccumulator, Params, Parser, Perform};

/// A type implementing Perform that just logs actions
struct Log;

impl Perform for Log {
    fn print(&mut self, c: char) {
        println!("[print] {:?}", c);
    }

    fn execute(&mut self, byte: u8) {
        println!("[execute] {:02x}", byte);
    }

    fn hook(&mut self, params: &Params, intermediates: &[u8], ignore: bool, c: u8) {
        println!(
            "[hook] params={:?}, intermediates={:?}, ignore={:?}, char={:?}",
            params, intermediates, ignore, c
        );
    }

    fn put(&mut self, byte: u8) {
        println!("[put] {:02x}", byte);
    }

    fn unhook(&mut self) {
        println!("[unhook]");
    }

    fn osc_dispatch(&mut self, params: &[&[u8]], bell_terminated: bool) {
        println!(
            "[osc_dispatch] params={:?} bell_terminated={}",
            params, bell_terminated
        );
    }

    fn csi_dispatch(&mut self, params: &Params, intermediates: &[u8], ignore: bool, c: u8) {
        println!(
            "[csi_dispatch] params={:#?}, intermediates={:?}, ignore={:?}, char={:?}",
            params, intermediates, ignore, c
        );
    }

    fn esc_dispatch(&mut self, intermediates: &[u8], ignore: bool, byte: u8) {
        println!(
            "[esc_dispatch] intermediates={:?}, ignore={:?}, byte={:02x}",
            intermediates, ignore, byte
        );
    }
}

fn main() {
    let input = io::stdin();
    let mut handle = input.lock();

    let mut statemachine = Parser::<DefaultCharAccumulator>::new();
    let mut performer = Log;

    let mut buf = [0; 2048];

    loop {
        match handle.read(&mut buf) {
            Ok(0) => break,
            Ok(n) => {
                for byte in &buf[..n] {
                    statemachine.advance(&mut performer, *byte);
                }
            }
            Err(err) => {
                println!("err: {}", err);
                break;
            }
        }
    }
}
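To try the example, pipe some styled output through it from the crate root of the vendored sources, e.g. `printf '\033[1;31mhi\033[0m\n' | cargo run --example parselog`; each byte's parser action is then logged line by line.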
Some files were not shown because too many files have changed in this diff