From c5b3f8ee98203191d83d3cfca39bb0f35ee6efc2 Mon Sep 17 00:00:00 2001 From: Laurenz Date: Sat, 16 Apr 2022 22:23:57 +0200 Subject: [PATCH] Switch to `unscanny` --- Cargo.lock | 6 + Cargo.toml | 1 + benches/oneshot.rs | 3 +- src/library/structure/list.rs | 9 +- src/parse/mod.rs | 2 - src/parse/parser.rs | 23 ++-- src/parse/resolve.rs | 20 ++-- src/parse/scanner.rs | 211 ---------------------------------- src/parse/tokens.rs | 135 +++++++++++++--------- src/source.rs | 10 +- tests/typeset.rs | 6 +- 11 files changed, 127 insertions(+), 299 deletions(-) delete mode 100644 src/parse/scanner.rs diff --git a/Cargo.lock b/Cargo.lock index 2341a52e8..5a2e079eb 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -876,6 +876,7 @@ dependencies = [ "unicode-script", "unicode-segmentation", "unicode-xid", + "unscanny", "usvg", "walkdir", "xi-unicode", @@ -938,6 +939,11 @@ version = "0.2.2" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "8ccb82d61f80a663efe1f787a51b16b5a51e3314d6ac365b08639f52387b33f3" +[[package]] +name = "unscanny" +version = "0.1.0" +source = "git+https://github.com/typst/unscanny#c943791649841388803b7ca873ce72683903fd39" + [[package]] name = "usvg" version = "0.20.0" diff --git a/Cargo.toml b/Cargo.toml index fa7449afd..341a96d10 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -21,6 +21,7 @@ once_cell = "1" serde = { version = "1", features = ["derive"] } typed-arena = "2" parking_lot = "0.12" +unscanny = { git = "https://github.com/typst/unscanny" } # Text and font handling hypher = "0.1" diff --git a/benches/oneshot.rs b/benches/oneshot.rs index ddd689703..c972d1bc4 100644 --- a/benches/oneshot.rs +++ b/benches/oneshot.rs @@ -1,9 +1,10 @@ use std::path::Path; use iai::{black_box, main, Iai}; +use unscanny::Scanner; use typst::loading::MemLoader; -use typst::parse::{parse, Scanner, TokenMode, Tokens}; +use typst::parse::{parse, TokenMode, Tokens}; use typst::source::SourceId; use typst::Context; diff --git a/src/library/structure/list.rs b/src/library/structure/list.rs index 9d5d3a7d5..10dcfb7b9 100644 --- a/src/library/structure/list.rs +++ b/src/library/structure/list.rs @@ -1,8 +1,9 @@ +use unscanny::Scanner; + use crate::library::layout::{GridNode, TrackSizing}; use crate::library::prelude::*; use crate::library::text::ParNode; use crate::library::utility::Numbering; -use crate::parse::Scanner; /// An unordered (bulleted) or ordered (numbered) list. #[derive(Debug, Hash)] @@ -190,7 +191,7 @@ impl Cast> for Label { let mut s = Scanner::new(&pattern); let mut prefix; let numbering = loop { - prefix = s.eaten(); + prefix = s.before(); match s.eat().map(|c| c.to_ascii_lowercase()) { Some('1') => break Numbering::Arabic, Some('a') => break Numbering::Letter, @@ -200,8 +201,8 @@ impl Cast> for Label { None => Err("invalid pattern")?, } }; - let upper = s.prev(0).map_or(false, char::is_uppercase); - let suffix = s.rest().into(); + let upper = s.scout(-1).map_or(false, char::is_uppercase); + let suffix = s.after().into(); Ok(Self::Pattern(prefix.into(), numbering, upper, suffix)) } Value::Content(v) => Ok(Self::Content(v)), diff --git a/src/parse/mod.rs b/src/parse/mod.rs index 47cba1119..7536b2ca0 100644 --- a/src/parse/mod.rs +++ b/src/parse/mod.rs @@ -3,13 +3,11 @@ mod incremental; mod parser; mod resolve; -mod scanner; mod tokens; pub use incremental::*; pub use parser::*; pub use resolve::*; -pub use scanner::*; pub use tokens::*; use std::collections::HashSet; diff --git a/src/parse/parser.rs b/src/parse/parser.rs index 98adfba26..4bbbdc28f 100644 --- a/src/parse/parser.rs +++ b/src/parse/parser.rs @@ -1,6 +1,6 @@ -use core::slice::SliceIndex; use std::fmt::{self, Display, Formatter}; use std::mem; +use std::ops::Range; use super::{TokenMode, Tokens}; use crate::syntax::{ErrorPos, Green, GreenData, GreenNode, NodeKind}; @@ -116,7 +116,7 @@ impl<'s> Parser<'s> { _ => false, }; - self.prev_end = self.tokens.index(); + self.prev_end = self.tokens.cursor(); self.bump(); if self.tokens.mode() == TokenMode::Code { @@ -186,15 +186,12 @@ impl<'s> Parser<'s> { /// Peek at the source of the current token. pub fn peek_src(&self) -> &'s str { - self.tokens.scanner().get(self.current_start() .. self.current_end()) + self.get(self.current_start() .. self.current_end()) } /// Obtain a range of the source code. - pub fn get(&self, index: I) -> &'s str - where - I: SliceIndex, - { - self.tokens.scanner().get(index) + pub fn get(&self, range: Range) -> &'s str { + self.tokens.scanner().get(range) } /// The byte index at which the last non-trivia token ended. @@ -209,7 +206,7 @@ impl<'s> Parser<'s> { /// The byte index at which the current token ends. pub fn current_end(&self) -> usize { - self.tokens.index() + self.tokens.cursor() } /// Determine the column index for the given byte index. @@ -294,8 +291,8 @@ impl<'s> Parser<'s> { } self.tokens.jump(target); - self.prev_end = self.tokens.index(); - self.current_start = self.tokens.index(); + self.prev_end = self.tokens.cursor(); + self.current_start = self.tokens.cursor(); self.current = self.tokens.next(); } @@ -311,9 +308,9 @@ impl<'s> Parser<'s> { /// handling. fn bump(&mut self) { let kind = self.current.take().unwrap(); - let len = self.tokens.index() - self.current_start; + let len = self.tokens.cursor() - self.current_start; self.children.push(GreenData::new(kind, len).into()); - self.current_start = self.tokens.index(); + self.current_start = self.tokens.cursor(); self.current = self.tokens.next(); } diff --git a/src/parse/resolve.rs b/src/parse/resolve.rs index 0d4cf071f..dd9ed4f49 100644 --- a/src/parse/resolve.rs +++ b/src/parse/resolve.rs @@ -1,4 +1,6 @@ -use super::{is_ident, is_newline, Scanner}; +use unscanny::Scanner; + +use super::{is_ident, is_newline}; use crate::syntax::ast::RawNode; use crate::util::EcoString; @@ -13,7 +15,7 @@ pub fn resolve_string(string: &str) -> EcoString { continue; } - let start = s.last_index(); + let start = s.locate(-1); match s.eat() { Some('\\') => out.push('\\'), Some('"') => out.push('"'), @@ -22,17 +24,17 @@ pub fn resolve_string(string: &str) -> EcoString { Some('t') => out.push('\t'), Some('u') if s.eat_if('{') => { // TODO: Feedback if closing brace is missing. - let sequence = s.eat_while(|c| c.is_ascii_hexdigit()); + let sequence = s.eat_while(char::is_ascii_hexdigit); let _terminated = s.eat_if('}'); match resolve_hex(sequence) { Some(c) => out.push(c), - None => out.push_str(s.eaten_from(start)), + None => out.push_str(s.from(start)), } } // TODO: Feedback about invalid escape sequence. - _ => out.push_str(s.eaten_from(start)), + _ => out.push_str(s.from(start)), } } @@ -68,8 +70,8 @@ pub fn resolve_raw(column: usize, backticks: usize, text: &str) -> RawNode { fn split_at_lang_tag(raw: &str) -> (&str, &str) { let mut s = Scanner::new(raw); ( - s.eat_until(|c| c == '`' || c.is_whitespace() || is_newline(c)), - s.rest(), + s.eat_until(|c: char| c == '`' || c.is_whitespace() || is_newline(c)), + s.after(), ) } @@ -129,9 +131,9 @@ fn split_lines(text: &str) -> Vec<&str> { } lines.push(&text[start .. end]); - start = s.index(); + start = s.cursor(); } - end = s.index(); + end = s.cursor(); } lines.push(&text[start ..]); diff --git a/src/parse/scanner.rs b/src/parse/scanner.rs deleted file mode 100644 index e4cf56e97..000000000 --- a/src/parse/scanner.rs +++ /dev/null @@ -1,211 +0,0 @@ -use std::slice::SliceIndex; - -use unicode_xid::UnicodeXID; - -/// A featureful char-based scanner. -#[derive(Copy, Clone)] -pub struct Scanner<'s> { - /// The string to scan. - src: &'s str, - /// The index at which the peekable character starts. Must be in bounds and - /// at a codepoint boundary to guarantee safety. - index: usize, -} - -impl<'s> Scanner<'s> { - /// Create a new char scanner. - #[inline] - pub fn new(src: &'s str) -> Self { - Self { src, index: 0 } - } - - /// Whether the end of the string is reached. - pub fn eof(&self) -> bool { - self.index == self.src.len() - } - - /// Consume the next char. - #[inline] - pub fn eat(&mut self) -> Option { - let next = self.peek(); - if let Some(c) = next { - self.index += c.len_utf8(); - } - next - } - - /// Consume the next char if it is the given one. - /// - /// Returns whether the char was consumed. - #[inline] - pub fn eat_if(&mut self, c: char) -> bool { - let matches = self.peek() == Some(c); - if matches { - self.index += c.len_utf8(); - } - matches - } - - /// Consume the next char, debug-asserting that it is the given one. - #[inline] - pub fn eat_assert(&mut self, c: char) { - let next = self.eat(); - debug_assert_eq!(next, Some(c)); - } - - /// Eat chars while the condition is true. - #[inline] - pub fn eat_while(&mut self, mut f: F) -> &'s str - where - F: FnMut(char) -> bool, - { - self.eat_until(|c| !f(c)) - } - - /// Eat chars until the condition is true. - #[inline] - pub fn eat_until(&mut self, mut f: F) -> &'s str - where - F: FnMut(char) -> bool, - { - let start = self.index; - while let Some(c) = self.peek() { - if f(c) { - break; - } - self.index += c.len_utf8(); - } - self.eaten_from(start) - } - - /// Uneat the last eaten char. - #[inline] - pub fn uneat(&mut self) { - self.index = self.last_index(); - } - - /// Peek at the next char without consuming it. - #[inline] - pub fn peek(&self) -> Option { - self.rest().chars().next() - } - - /// Get the nth-previous eaten char. - #[inline] - pub fn prev(&self, n: usize) -> Option { - self.eaten().chars().nth_back(n) - } - - /// Checks whether the next char fulfills a condition. - /// - /// Returns `default` if there is no next char. - #[inline] - pub fn check_or(&self, default: bool, f: F) -> bool - where - F: FnOnce(char) -> bool, - { - self.peek().map_or(default, f) - } - - /// The previous index in the source string. - #[inline] - pub fn last_index(&self) -> usize { - self.eaten().chars().last().map_or(0, |c| self.index - c.len_utf8()) - } - - /// The current index in the source string. - #[inline] - pub fn index(&self) -> usize { - self.index - } - - /// Jump to an index in the source string. - #[inline] - pub fn jump(&mut self, index: usize) { - // Make sure that the index is in bounds and on a codepoint boundary. - self.src.get(index ..).expect("jumped to invalid index"); - self.index = index; - } - - /// The full source string. - #[inline] - pub fn src(&self) -> &'s str { - self.src - } - - /// Slice out part of the source string. - #[inline] - pub fn get(&self, index: I) -> &'s str - where - I: SliceIndex, - { - // See `eaten_from` for details about `unwrap_or_default`. - self.src.get(index).unwrap_or_default() - } - - /// The remaining source string after the current index. - #[inline] - pub fn rest(&self) -> &'s str { - // Safety: The index is always in bounds and on a codepoint boundary - // since it starts at zero and is is: - // - either increased by the length of a scanned character, advacing - // from one codepoint boundary to the next, - // - or checked upon jumping. - unsafe { self.src.get_unchecked(self.index ..) } - } - - /// The full source string up to the current index. - #[inline] - pub fn eaten(&self) -> &'s str { - // Safety: The index is always okay, for details see `rest()`. - unsafe { self.src.get_unchecked(.. self.index) } - } - - /// The source string from `start` to the current index. - #[inline] - pub fn eaten_from(&self, start: usize) -> &'s str { - // Using `unwrap_or_default` is much faster than unwrap, probably - // because then the whole call to `eaten_from` is pure and can be - // optimized away in some cases. - self.src.get(start .. self.index).unwrap_or_default() - } -} - -/// Whether this character denotes a newline. -#[inline] -pub fn is_newline(character: char) -> bool { - matches!( - character, - // Line Feed, Vertical Tab, Form Feed, Carriage Return. - '\n' | '\x0B' | '\x0C' | '\r' | - // Next Line, Line Separator, Paragraph Separator. - '\u{0085}' | '\u{2028}' | '\u{2029}' - ) -} - -/// Whether a string is a valid unicode identifier. -/// -/// In addition to what is specified in the [Unicode Standard][uax31], we allow: -/// - `_` as a starting character, -/// - `_` and `-` as continuing characters. -/// -/// [uax31]: http://www.unicode.org/reports/tr31/ -#[inline] -pub fn is_ident(string: &str) -> bool { - let mut chars = string.chars(); - chars - .next() - .map_or(false, |c| is_id_start(c) && chars.all(is_id_continue)) -} - -/// Whether a character can start an identifier. -#[inline] -pub fn is_id_start(c: char) -> bool { - c.is_xid_start() || c == '_' -} - -/// Whether a character can continue an identifier. -#[inline] -pub fn is_id_continue(c: char) -> bool { - c.is_xid_continue() || c == '_' || c == '-' -} diff --git a/src/parse/tokens.rs b/src/parse/tokens.rs index 053a7f61e..ae3d7b9c5 100644 --- a/src/parse/tokens.rs +++ b/src/parse/tokens.rs @@ -1,9 +1,9 @@ use std::sync::Arc; -use super::{ - is_id_continue, is_id_start, is_newline, resolve_hex, resolve_raw, resolve_string, - Scanner, -}; +use unicode_xid::UnicodeXID; +use unscanny::Scanner; + +use super::{resolve_hex, resolve_raw, resolve_string}; use crate::geom::{AngleUnit, LengthUnit}; use crate::syntax::ast::{MathNode, RawNode, Unit}; use crate::syntax::{ErrorPos, NodeKind}; @@ -65,13 +65,11 @@ impl<'s> Tokens<'s> { /// The index in the string at which the last token ends and next token /// will start. #[inline] - pub fn index(&self) -> usize { - self.s.index() + pub fn cursor(&self) -> usize { + self.s.cursor() } /// Jump to the given index in the string. - /// - /// You need to know the correct column. #[inline] pub fn jump(&mut self, index: usize) { self.s.jump(index); @@ -92,7 +90,7 @@ impl<'s> Tokens<'s> { /// The column index of a given index in the source string. #[inline] pub fn column(&self, index: usize) -> usize { - column(self.s.src(), index, self.column_offset) + column(self.s.string(), index, self.column_offset) } } @@ -102,7 +100,7 @@ impl<'s> Iterator for Tokens<'s> { /// Parse the next token in the source code. #[inline] fn next(&mut self) -> Option { - let start = self.s.index(); + let start = self.s.cursor(); let c = self.s.eat()?; Some(match c { // Blocks. @@ -112,15 +110,13 @@ impl<'s> Iterator for Tokens<'s> { ']' => NodeKind::RightBracket, // Whitespace. - ' ' if self.s.check_or(true, |c| !c.is_whitespace()) => NodeKind::Space(0), + ' ' if self.s.done() || !self.s.at(char::is_whitespace) => NodeKind::Space(0), c if c.is_whitespace() => self.whitespace(), // Comments with special case for URLs. '/' if self.s.eat_if('*') => self.block_comment(), '/' if !self.maybe_in_url() && self.s.eat_if('/') => self.line_comment(), - '*' if self.s.eat_if('/') => { - NodeKind::Unknown(self.s.eaten_from(start).into()) - } + '*' if self.s.eat_if('/') => NodeKind::Unknown(self.s.from(start).into()), // Other things. _ => match self.mode { @@ -187,22 +183,20 @@ impl<'s> Tokens<'s> { '=' => NodeKind::Eq, '<' => NodeKind::Lt, '>' => NodeKind::Gt, - '.' if self.s.check_or(true, |n| !n.is_ascii_digit()) => NodeKind::Dot, + '.' if self.s.done() || !self.s.at(char::is_ascii_digit) => NodeKind::Dot, // Identifiers. c if is_id_start(c) => self.ident(start), // Numbers. - c if c.is_ascii_digit() - || (c == '.' && self.s.check_or(false, |n| n.is_ascii_digit())) => - { + c if c.is_ascii_digit() || (c == '.' && self.s.at(char::is_ascii_digit)) => { self.number(start, c) } // Strings. '"' => self.string(), - _ => NodeKind::Unknown(self.s.eaten_from(start).into()), + _ => NodeKind::Unknown(self.s.from(start).into()), } } @@ -226,19 +220,19 @@ impl<'s> Tokens<'s> { }; loop { - self.s.eat_until(|c| { + self.s.eat_until(|c: char| { TABLE.get(c as usize).copied().unwrap_or_else(|| c.is_whitespace()) }); let mut s = self.s; - if !(s.eat_if(' ') && s.check_or(false, char::is_alphanumeric)) { + if !(s.eat_if(' ') && s.at(char::is_alphanumeric)) { break; } self.s.eat(); } - NodeKind::Text(self.s.eaten_from(start).into()) + NodeKind::Text(self.s.from(start).into()) } fn whitespace(&mut self) -> NodeKind { @@ -276,13 +270,11 @@ impl<'s> Tokens<'s> { '[' | ']' | '{' | '}' | '#' | // Markup. '~' | '\'' | '"' | '*' | '_' | '`' | '$' | '=' | '-' | '.' => { - self.s.eat_assert(c) ; + self.s.expect(c); NodeKind::Escape(c) } - 'u' if self.s.rest().starts_with("u{") => { - self.s.eat_assert('u'); - self.s.eat_assert('{'); - let sequence = self.s.eat_while(|c| c.is_ascii_alphanumeric()); + 'u' if self.s.eat_if("u{") => { + let sequence = self.s.eat_while(char::is_ascii_alphanumeric); if self.s.eat_if('}') { if let Some(c) = resolve_hex(sequence) { NodeKind::Escape(c) @@ -304,7 +296,7 @@ impl<'s> Tokens<'s> { // Linebreaks. c if c.is_whitespace() => NodeKind::Linebreak(false), '+' => { - self.s.eat_assert(c); + self.s.expect(c); NodeKind::Linebreak(true) } @@ -315,7 +307,7 @@ impl<'s> Tokens<'s> { #[inline] fn hash(&mut self) -> NodeKind { - if self.s.check_or(false, is_id_start) { + if self.s.at(is_id_start) { let read = self.s.eat_while(is_id_continue); match keyword(read) { Some(keyword) => keyword, @@ -342,10 +334,10 @@ impl<'s> Tokens<'s> { fn numbering(&mut self, start: usize, c: char) -> NodeKind { let number = if c != '.' { - self.s.eat_while(|c| c.is_ascii_digit()); - let read = self.s.eaten_from(start); + self.s.eat_while(char::is_ascii_digit); + let read = self.s.from(start); if !self.s.eat_if('.') { - return NodeKind::Text(self.s.eaten_from(start).into()); + return NodeKind::Text(self.s.from(start).into()); } read.parse().ok() } else { @@ -356,7 +348,7 @@ impl<'s> Tokens<'s> { } fn raw(&mut self) -> NodeKind { - let column = self.column(self.s.index() - 1); + let column = self.column(self.s.cursor() - 1); let mut backticks = 1; while self.s.eat_if('`') { @@ -372,7 +364,7 @@ impl<'s> Tokens<'s> { })); } - let start = self.s.index(); + let start = self.s.cursor(); let mut found = 0; while found < backticks { @@ -384,7 +376,7 @@ impl<'s> Tokens<'s> { } if found == backticks { - let end = self.s.index() - found as usize; + let end = self.s.cursor() - found as usize; NodeKind::Raw(Arc::new(resolve_raw( column, backticks, @@ -412,7 +404,7 @@ impl<'s> Tokens<'s> { display = true; } - let start = self.s.index(); + let start = self.s.cursor(); let mut escaped = false; let mut dollar = !display; @@ -429,7 +421,7 @@ impl<'s> Tokens<'s> { } }; - let end = self.s.index() + let end = self.s.cursor() - match (terminated, display) { (false, _) => 0, (true, false) => 1, @@ -456,7 +448,7 @@ impl<'s> Tokens<'s> { fn ident(&mut self, start: usize) -> NodeKind { self.s.eat_while(is_id_continue); - match self.s.eaten_from(start) { + match self.s.from(start) { "none" => NodeKind::None, "auto" => NodeKind::Auto, "true" => NodeKind::Bool(true), @@ -467,30 +459,29 @@ impl<'s> Tokens<'s> { fn number(&mut self, start: usize, c: char) -> NodeKind { // Read the first part (integer or fractional depending on `first`). - self.s.eat_while(|c| c.is_ascii_digit()); + self.s.eat_while(char::is_ascii_digit); // Read the fractional part if not already done. // Make sure not to confuse a range for the decimal separator. - if c != '.' && !self.s.rest().starts_with("..") && self.s.eat_if('.') { - self.s.eat_while(|c| c.is_ascii_digit()); + if c != '.' && !self.s.at("..") && self.s.eat_if('.') { + self.s.eat_while(char::is_ascii_digit); } // Read the exponent. - let em = self.s.rest().starts_with("em"); - if !em && self.s.eat_if('e') || self.s.eat_if('E') { - let _ = self.s.eat_if('+') || self.s.eat_if('-'); - self.s.eat_while(|c| c.is_ascii_digit()); + if !self.s.at("em") && self.s.eat_if(['e', 'E']) { + self.s.eat_if(['+', '-']); + self.s.eat_while(char::is_ascii_digit); } // Read the suffix. - let suffix_start = self.s.index(); + let suffix_start = self.s.cursor(); if !self.s.eat_if('%') { - self.s.eat_while(|c| c.is_ascii_alphanumeric()); + self.s.eat_while(char::is_ascii_alphanumeric); } let number = self.s.get(start .. suffix_start); - let suffix = self.s.eaten_from(suffix_start); - let all = self.s.eaten_from(start); + let suffix = self.s.from(suffix_start); + let all = self.s.from(start); // Find out whether it is a simple number. if suffix.is_empty() { @@ -575,13 +566,13 @@ impl<'s> Tokens<'s> { fn in_word(&self) -> bool { let alphanumeric = |c: Option| c.map_or(false, |c| c.is_alphanumeric()); - let prev = self.s.prev(1); + let prev = self.s.scout(-2); let next = self.s.peek(); alphanumeric(prev) && alphanumeric(next) } fn maybe_in_url(&self) -> bool { - self.mode == TokenMode::Markup && self.s.eaten().ends_with(":/") + self.mode == TokenMode::Markup && self.s.before().ends_with(":/") } } @@ -610,7 +601,8 @@ fn keyword(ident: &str) -> Option { }) } -/// The column index of a given index in the source string, given a column offset for the first line. +/// The column index of a given index in the source string, given a column +/// offset for the first line. #[inline] fn column(string: &str, index: usize, offset: usize) -> usize { let mut apply_offset = false; @@ -634,6 +626,45 @@ fn column(string: &str, index: usize, offset: usize) -> usize { if apply_offset { res + offset } else { res } } +/// Whether this character denotes a newline. +#[inline] +pub fn is_newline(character: char) -> bool { + matches!( + character, + // Line Feed, Vertical Tab, Form Feed, Carriage Return. + '\n' | '\x0B' | '\x0C' | '\r' | + // Next Line, Line Separator, Paragraph Separator. + '\u{0085}' | '\u{2028}' | '\u{2029}' + ) +} + +/// Whether a string is a valid unicode identifier. +/// +/// In addition to what is specified in the [Unicode Standard][uax31], we allow: +/// - `_` as a starting character, +/// - `_` and `-` as continuing characters. +/// +/// [uax31]: http://www.unicode.org/reports/tr31/ +#[inline] +pub fn is_ident(string: &str) -> bool { + let mut chars = string.chars(); + chars + .next() + .map_or(false, |c| is_id_start(c) && chars.all(is_id_continue)) +} + +/// Whether a character can start an identifier. +#[inline] +pub fn is_id_start(c: char) -> bool { + c.is_xid_start() || c == '_' +} + +/// Whether a character can continue an identifier. +#[inline] +pub fn is_id_continue(c: char) -> bool { + c.is_xid_continue() || c == '_' || c == '-' +} + #[cfg(test)] #[allow(non_snake_case)] mod tests { diff --git a/src/source.rs b/src/source.rs index 9f2a01403..37aa96cac 100644 --- a/src/source.rs +++ b/src/source.rs @@ -6,9 +6,11 @@ use std::ops::Range; use std::path::{Path, PathBuf}; use std::sync::Arc; +use unscanny::Scanner; + use crate::diag::TypResult; use crate::loading::{FileHash, Loader}; -use crate::parse::{is_newline, parse, Reparser, Scanner}; +use crate::parse::{is_newline, parse, Reparser}; use crate::syntax::ast::Markup; use crate::syntax::{self, Category, GreenNode, RedNode}; use crate::util::{PathExt, StrExt}; @@ -382,12 +384,12 @@ impl Line { let mut utf16_idx = utf16_offset; std::iter::from_fn(move || { - s.eat_until(|c| { + s.eat_until(|c: char| { utf16_idx += c.len_utf16(); is_newline(c) }); - if s.eof() { + if s.done() { return None; } @@ -396,7 +398,7 @@ impl Line { } Some(Line { - byte_idx: byte_offset + s.index(), + byte_idx: byte_offset + s.cursor(), utf16_idx, }) }) diff --git a/tests/typeset.rs b/tests/typeset.rs index bba826219..02d3ee389 100644 --- a/tests/typeset.rs +++ b/tests/typeset.rs @@ -6,6 +6,7 @@ use std::path::Path; use std::sync::Arc; use tiny_skia as sk; +use unscanny::Scanner; use walkdir::WalkDir; use typst::diag::Error; @@ -15,7 +16,6 @@ use typst::geom::{Length, RgbaColor}; use typst::library::layout::PageNode; use typst::library::text::{TextNode, TextSize}; use typst::loading::FsLoader; -use typst::parse::Scanner; use typst::source::SourceFile; use typst::syntax::Span; use typst::{bail, Context}; @@ -329,7 +329,7 @@ fn parse_metadata(source: &SourceFile) -> (Option, Vec) { }; fn num(s: &mut Scanner) -> usize { - s.eat_while(|c| c.is_numeric()).parse().unwrap() + s.eat_while(char::is_numeric).parse().unwrap() } let comments = @@ -348,7 +348,7 @@ fn parse_metadata(source: &SourceFile) -> (Option, Vec) { let end = if s.eat_if('-') { pos(&mut s) } else { start }; let span = Span::new(source.id(), start, end); - errors.push(Error::new(span, s.rest().trim())); + errors.push(Error::new(span, s.after().trim())); } (compare_ref, errors)