Switch to unscanny
parent 2db4b603db
commit c5b3f8ee98
Cargo.lock (generated): 6 changed lines
@@ -876,6 +876,7 @@ dependencies = [
 "unicode-script",
 "unicode-segmentation",
 "unicode-xid",
+"unscanny",
 "usvg",
 "walkdir",
 "xi-unicode",
@@ -938,6 +939,11 @@ version = "0.2.2"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "8ccb82d61f80a663efe1f787a51b16b5a51e3314d6ac365b08639f52387b33f3"
 
+[[package]]
+name = "unscanny"
+version = "0.1.0"
+source = "git+https://github.com/typst/unscanny#c943791649841388803b7ca873ce72683903fd39"
+
 [[package]]
 name = "usvg"
 version = "0.20.0"
@@ -21,6 +21,7 @@ once_cell = "1"
 serde = { version = "1", features = ["derive"] }
 typed-arena = "2"
 parking_lot = "0.12"
+unscanny = { git = "https://github.com/typst/unscanny" }
 
 # Text and font handling
 hypher = "0.1"
@@ -1,9 +1,10 @@
 use std::path::Path;
 
 use iai::{black_box, main, Iai};
+use unscanny::Scanner;
 
 use typst::loading::MemLoader;
-use typst::parse::{parse, Scanner, TokenMode, Tokens};
+use typst::parse::{parse, TokenMode, Tokens};
 use typst::source::SourceId;
 use typst::Context;
 
@@ -1,8 +1,9 @@
+use unscanny::Scanner;
+
 use crate::library::layout::{GridNode, TrackSizing};
 use crate::library::prelude::*;
 use crate::library::text::ParNode;
 use crate::library::utility::Numbering;
-use crate::parse::Scanner;
 
 /// An unordered (bulleted) or ordered (numbered) list.
 #[derive(Debug, Hash)]
@@ -190,7 +191,7 @@ impl Cast<Spanned<Value>> for Label {
 let mut s = Scanner::new(&pattern);
 let mut prefix;
 let numbering = loop {
-prefix = s.eaten();
+prefix = s.before();
 match s.eat().map(|c| c.to_ascii_lowercase()) {
 Some('1') => break Numbering::Arabic,
 Some('a') => break Numbering::Letter,
@@ -200,8 +201,8 @@ impl Cast<Spanned<Value>> for Label {
 None => Err("invalid pattern")?,
 }
 };
-let upper = s.prev(0).map_or(false, char::is_uppercase);
-let suffix = s.rest().into();
+let upper = s.scout(-1).map_or(false, char::is_uppercase);
+let suffix = s.after().into();
 Ok(Self::Pattern(prefix.into(), numbering, upper, suffix))
 }
 Value::Content(v) => Ok(Self::Content(v)),
@@ -3,13 +3,11 @@
 mod incremental;
 mod parser;
 mod resolve;
-mod scanner;
 mod tokens;
 
 pub use incremental::*;
 pub use parser::*;
 pub use resolve::*;
-pub use scanner::*;
 pub use tokens::*;
 
 use std::collections::HashSet;
@@ -1,6 +1,6 @@
-use core::slice::SliceIndex;
 use std::fmt::{self, Display, Formatter};
 use std::mem;
+use std::ops::Range;
 
 use super::{TokenMode, Tokens};
 use crate::syntax::{ErrorPos, Green, GreenData, GreenNode, NodeKind};
@@ -116,7 +116,7 @@ impl<'s> Parser<'s> {
 _ => false,
 };
 
-self.prev_end = self.tokens.index();
+self.prev_end = self.tokens.cursor();
 self.bump();
 
 if self.tokens.mode() == TokenMode::Code {
@@ -186,15 +186,12 @@ impl<'s> Parser<'s> {
 
 /// Peek at the source of the current token.
 pub fn peek_src(&self) -> &'s str {
-self.tokens.scanner().get(self.current_start() .. self.current_end())
+self.get(self.current_start() .. self.current_end())
 }
 
 /// Obtain a range of the source code.
-pub fn get<I>(&self, index: I) -> &'s str
-where
-I: SliceIndex<str, Output = str>,
-{
-self.tokens.scanner().get(index)
+pub fn get(&self, range: Range<usize>) -> &'s str {
+self.tokens.scanner().get(range)
 }
 
 /// The byte index at which the last non-trivia token ended.
@@ -209,7 +206,7 @@ impl<'s> Parser<'s> {
 
 /// The byte index at which the current token ends.
 pub fn current_end(&self) -> usize {
-self.tokens.index()
+self.tokens.cursor()
 }
 
 /// Determine the column index for the given byte index.
@@ -294,8 +291,8 @@ impl<'s> Parser<'s> {
 }
 
 self.tokens.jump(target);
-self.prev_end = self.tokens.index();
-self.current_start = self.tokens.index();
+self.prev_end = self.tokens.cursor();
+self.current_start = self.tokens.cursor();
 self.current = self.tokens.next();
 }
 
@@ -311,9 +308,9 @@ impl<'s> Parser<'s> {
 /// handling.
 fn bump(&mut self) {
 let kind = self.current.take().unwrap();
-let len = self.tokens.index() - self.current_start;
+let len = self.tokens.cursor() - self.current_start;
 self.children.push(GreenData::new(kind, len).into());
-self.current_start = self.tokens.index();
+self.current_start = self.tokens.cursor();
 self.current = self.tokens.next();
 }
 
@@ -1,4 +1,6 @@
-use super::{is_ident, is_newline, Scanner};
+use unscanny::Scanner;
+
+use super::{is_ident, is_newline};
 use crate::syntax::ast::RawNode;
 use crate::util::EcoString;
 
@@ -13,7 +15,7 @@ pub fn resolve_string(string: &str) -> EcoString {
 continue;
 }
 
-let start = s.last_index();
+let start = s.locate(-1);
 match s.eat() {
 Some('\\') => out.push('\\'),
 Some('"') => out.push('"'),
@@ -22,17 +24,17 @@ pub fn resolve_string(string: &str) -> EcoString {
 Some('t') => out.push('\t'),
 Some('u') if s.eat_if('{') => {
 // TODO: Feedback if closing brace is missing.
-let sequence = s.eat_while(|c| c.is_ascii_hexdigit());
+let sequence = s.eat_while(char::is_ascii_hexdigit);
 let _terminated = s.eat_if('}');
 
 match resolve_hex(sequence) {
 Some(c) => out.push(c),
-None => out.push_str(s.eaten_from(start)),
+None => out.push_str(s.from(start)),
 }
 }
 
 // TODO: Feedback about invalid escape sequence.
-_ => out.push_str(s.eaten_from(start)),
+_ => out.push_str(s.from(start)),
 }
 }
 
@@ -68,8 +70,8 @@ pub fn resolve_raw(column: usize, backticks: usize, text: &str) -> RawNode {
 fn split_at_lang_tag(raw: &str) -> (&str, &str) {
 let mut s = Scanner::new(raw);
 (
-s.eat_until(|c| c == '`' || c.is_whitespace() || is_newline(c)),
-s.rest(),
+s.eat_until(|c: char| c == '`' || c.is_whitespace() || is_newline(c)),
+s.after(),
 )
 }
 
@@ -129,9 +131,9 @@ fn split_lines(text: &str) -> Vec<&str> {
 }
 
 lines.push(&text[start .. end]);
-start = s.index();
+start = s.cursor();
 }
-end = s.index();
+end = s.cursor();
 }
 
 lines.push(&text[start ..]);
@@ -1,211 +0,0 @@
-use std::slice::SliceIndex;
-
-use unicode_xid::UnicodeXID;
-
-/// A featureful char-based scanner.
-#[derive(Copy, Clone)]
-pub struct Scanner<'s> {
-/// The string to scan.
-src: &'s str,
-/// The index at which the peekable character starts. Must be in bounds and
-/// at a codepoint boundary to guarantee safety.
-index: usize,
-}
-
-impl<'s> Scanner<'s> {
-/// Create a new char scanner.
-#[inline]
-pub fn new(src: &'s str) -> Self {
-Self { src, index: 0 }
-}
-
-/// Whether the end of the string is reached.
-pub fn eof(&self) -> bool {
-self.index == self.src.len()
-}
-
-/// Consume the next char.
-#[inline]
-pub fn eat(&mut self) -> Option<char> {
-let next = self.peek();
-if let Some(c) = next {
-self.index += c.len_utf8();
-}
-next
-}
-
-/// Consume the next char if it is the given one.
-///
-/// Returns whether the char was consumed.
-#[inline]
-pub fn eat_if(&mut self, c: char) -> bool {
-let matches = self.peek() == Some(c);
-if matches {
-self.index += c.len_utf8();
-}
-matches
-}
-
-/// Consume the next char, debug-asserting that it is the given one.
-#[inline]
-pub fn eat_assert(&mut self, c: char) {
-let next = self.eat();
-debug_assert_eq!(next, Some(c));
-}
-
-/// Eat chars while the condition is true.
-#[inline]
-pub fn eat_while<F>(&mut self, mut f: F) -> &'s str
-where
-F: FnMut(char) -> bool,
-{
-self.eat_until(|c| !f(c))
-}
-
-/// Eat chars until the condition is true.
-#[inline]
-pub fn eat_until<F>(&mut self, mut f: F) -> &'s str
-where
-F: FnMut(char) -> bool,
-{
-let start = self.index;
-while let Some(c) = self.peek() {
-if f(c) {
-break;
-}
-self.index += c.len_utf8();
-}
-self.eaten_from(start)
-}
-
-/// Uneat the last eaten char.
-#[inline]
-pub fn uneat(&mut self) {
-self.index = self.last_index();
-}
-
-/// Peek at the next char without consuming it.
-#[inline]
-pub fn peek(&self) -> Option<char> {
-self.rest().chars().next()
-}
-
-/// Get the nth-previous eaten char.
-#[inline]
-pub fn prev(&self, n: usize) -> Option<char> {
-self.eaten().chars().nth_back(n)
-}
-
-/// Checks whether the next char fulfills a condition.
-///
-/// Returns `default` if there is no next char.
-#[inline]
-pub fn check_or<F>(&self, default: bool, f: F) -> bool
-where
-F: FnOnce(char) -> bool,
-{
-self.peek().map_or(default, f)
-}
-
-/// The previous index in the source string.
-#[inline]
-pub fn last_index(&self) -> usize {
-self.eaten().chars().last().map_or(0, |c| self.index - c.len_utf8())
-}
-
-/// The current index in the source string.
-#[inline]
-pub fn index(&self) -> usize {
-self.index
-}
-
-/// Jump to an index in the source string.
-#[inline]
-pub fn jump(&mut self, index: usize) {
-// Make sure that the index is in bounds and on a codepoint boundary.
-self.src.get(index ..).expect("jumped to invalid index");
-self.index = index;
-}
-
-/// The full source string.
-#[inline]
-pub fn src(&self) -> &'s str {
-self.src
-}
-
-/// Slice out part of the source string.
-#[inline]
-pub fn get<I>(&self, index: I) -> &'s str
-where
-I: SliceIndex<str, Output = str>,
-{
-// See `eaten_from` for details about `unwrap_or_default`.
-self.src.get(index).unwrap_or_default()
-}
-
-/// The remaining source string after the current index.
-#[inline]
-pub fn rest(&self) -> &'s str {
-// Safety: The index is always in bounds and on a codepoint boundary
-// since it starts at zero and is is:
-// - either increased by the length of a scanned character, advacing
-// from one codepoint boundary to the next,
-// - or checked upon jumping.
-unsafe { self.src.get_unchecked(self.index ..) }
-}
-
-/// The full source string up to the current index.
-#[inline]
-pub fn eaten(&self) -> &'s str {
-// Safety: The index is always okay, for details see `rest()`.
-unsafe { self.src.get_unchecked(.. self.index) }
-}
-
-/// The source string from `start` to the current index.
-#[inline]
-pub fn eaten_from(&self, start: usize) -> &'s str {
-// Using `unwrap_or_default` is much faster than unwrap, probably
-// because then the whole call to `eaten_from` is pure and can be
-// optimized away in some cases.
-self.src.get(start .. self.index).unwrap_or_default()
-}
-}
-
-/// Whether this character denotes a newline.
-#[inline]
-pub fn is_newline(character: char) -> bool {
-matches!(
-character,
-// Line Feed, Vertical Tab, Form Feed, Carriage Return.
-'\n' | '\x0B' | '\x0C' | '\r' |
-// Next Line, Line Separator, Paragraph Separator.
-'\u{0085}' | '\u{2028}' | '\u{2029}'
-)
-}
-
-/// Whether a string is a valid unicode identifier.
-///
-/// In addition to what is specified in the [Unicode Standard][uax31], we allow:
-/// - `_` as a starting character,
-/// - `_` and `-` as continuing characters.
-///
-/// [uax31]: http://www.unicode.org/reports/tr31/
-#[inline]
-pub fn is_ident(string: &str) -> bool {
-let mut chars = string.chars();
-chars
-.next()
-.map_or(false, |c| is_id_start(c) && chars.all(is_id_continue))
-}
-
-/// Whether a character can start an identifier.
-#[inline]
-pub fn is_id_start(c: char) -> bool {
-c.is_xid_start() || c == '_'
-}
-
-/// Whether a character can continue an identifier.
-#[inline]
-pub fn is_id_continue(c: char) -> bool {
-c.is_xid_continue() || c == '_' || c == '-'
-}
@@ -1,9 +1,9 @@
 use std::sync::Arc;
 
-use super::{
-is_id_continue, is_id_start, is_newline, resolve_hex, resolve_raw, resolve_string,
-Scanner,
-};
+use unicode_xid::UnicodeXID;
+use unscanny::Scanner;
+
+use super::{resolve_hex, resolve_raw, resolve_string};
 use crate::geom::{AngleUnit, LengthUnit};
 use crate::syntax::ast::{MathNode, RawNode, Unit};
 use crate::syntax::{ErrorPos, NodeKind};
@@ -65,13 +65,11 @@ impl<'s> Tokens<'s> {
 /// The index in the string at which the last token ends and next token
 /// will start.
 #[inline]
-pub fn index(&self) -> usize {
-self.s.index()
+pub fn cursor(&self) -> usize {
+self.s.cursor()
 }
 
 /// Jump to the given index in the string.
-///
-/// You need to know the correct column.
 #[inline]
 pub fn jump(&mut self, index: usize) {
 self.s.jump(index);
@@ -92,7 +90,7 @@ impl<'s> Tokens<'s> {
 /// The column index of a given index in the source string.
 #[inline]
 pub fn column(&self, index: usize) -> usize {
-column(self.s.src(), index, self.column_offset)
+column(self.s.string(), index, self.column_offset)
 }
 }
 
@@ -102,7 +100,7 @@ impl<'s> Iterator for Tokens<'s> {
 /// Parse the next token in the source code.
 #[inline]
 fn next(&mut self) -> Option<Self::Item> {
-let start = self.s.index();
+let start = self.s.cursor();
 let c = self.s.eat()?;
 Some(match c {
 // Blocks.
@@ -112,15 +110,13 @@ impl<'s> Iterator for Tokens<'s> {
 ']' => NodeKind::RightBracket,
 
 // Whitespace.
-' ' if self.s.check_or(true, |c| !c.is_whitespace()) => NodeKind::Space(0),
+' ' if self.s.done() || !self.s.at(char::is_whitespace) => NodeKind::Space(0),
 c if c.is_whitespace() => self.whitespace(),
 
 // Comments with special case for URLs.
 '/' if self.s.eat_if('*') => self.block_comment(),
 '/' if !self.maybe_in_url() && self.s.eat_if('/') => self.line_comment(),
-'*' if self.s.eat_if('/') => {
-NodeKind::Unknown(self.s.eaten_from(start).into())
-}
+'*' if self.s.eat_if('/') => NodeKind::Unknown(self.s.from(start).into()),
 
 // Other things.
 _ => match self.mode {
@@ -187,22 +183,20 @@ impl<'s> Tokens<'s> {
 '=' => NodeKind::Eq,
 '<' => NodeKind::Lt,
 '>' => NodeKind::Gt,
-'.' if self.s.check_or(true, |n| !n.is_ascii_digit()) => NodeKind::Dot,
+'.' if self.s.done() || !self.s.at(char::is_ascii_digit) => NodeKind::Dot,
 
 // Identifiers.
 c if is_id_start(c) => self.ident(start),
 
 // Numbers.
-c if c.is_ascii_digit()
-|| (c == '.' && self.s.check_or(false, |n| n.is_ascii_digit())) =>
-{
+c if c.is_ascii_digit() || (c == '.' && self.s.at(char::is_ascii_digit)) => {
 self.number(start, c)
 }
 
 // Strings.
 '"' => self.string(),
 
-_ => NodeKind::Unknown(self.s.eaten_from(start).into()),
+_ => NodeKind::Unknown(self.s.from(start).into()),
 }
 }
 
@@ -226,19 +220,19 @@ impl<'s> Tokens<'s> {
 };
 
 loop {
-self.s.eat_until(|c| {
+self.s.eat_until(|c: char| {
 TABLE.get(c as usize).copied().unwrap_or_else(|| c.is_whitespace())
 });
 
 let mut s = self.s;
-if !(s.eat_if(' ') && s.check_or(false, char::is_alphanumeric)) {
+if !(s.eat_if(' ') && s.at(char::is_alphanumeric)) {
 break;
 }
 
 self.s.eat();
 }
 
-NodeKind::Text(self.s.eaten_from(start).into())
+NodeKind::Text(self.s.from(start).into())
 }
 
 fn whitespace(&mut self) -> NodeKind {
@@ -276,13 +270,11 @@ impl<'s> Tokens<'s> {
 '[' | ']' | '{' | '}' | '#' |
 // Markup.
 '~' | '\'' | '"' | '*' | '_' | '`' | '$' | '=' | '-' | '.' => {
-self.s.eat_assert(c);
+self.s.expect(c);
 NodeKind::Escape(c)
 }
-'u' if self.s.rest().starts_with("u{") => {
-self.s.eat_assert('u');
-self.s.eat_assert('{');
-let sequence = self.s.eat_while(|c| c.is_ascii_alphanumeric());
+'u' if self.s.eat_if("u{") => {
+let sequence = self.s.eat_while(char::is_ascii_alphanumeric);
 if self.s.eat_if('}') {
 if let Some(c) = resolve_hex(sequence) {
 NodeKind::Escape(c)
@@ -304,7 +296,7 @@ impl<'s> Tokens<'s> {
 // Linebreaks.
 c if c.is_whitespace() => NodeKind::Linebreak(false),
 '+' => {
-self.s.eat_assert(c);
+self.s.expect(c);
 NodeKind::Linebreak(true)
 }
 
@@ -315,7 +307,7 @@ impl<'s> Tokens<'s> {
 
 #[inline]
 fn hash(&mut self) -> NodeKind {
-if self.s.check_or(false, is_id_start) {
+if self.s.at(is_id_start) {
 let read = self.s.eat_while(is_id_continue);
 match keyword(read) {
 Some(keyword) => keyword,
@@ -342,10 +334,10 @@ impl<'s> Tokens<'s> {
 
 fn numbering(&mut self, start: usize, c: char) -> NodeKind {
 let number = if c != '.' {
-self.s.eat_while(|c| c.is_ascii_digit());
-let read = self.s.eaten_from(start);
+self.s.eat_while(char::is_ascii_digit);
+let read = self.s.from(start);
 if !self.s.eat_if('.') {
-return NodeKind::Text(self.s.eaten_from(start).into());
+return NodeKind::Text(self.s.from(start).into());
 }
 read.parse().ok()
 } else {
@@ -356,7 +348,7 @@ impl<'s> Tokens<'s> {
 }
 
 fn raw(&mut self) -> NodeKind {
-let column = self.column(self.s.index() - 1);
+let column = self.column(self.s.cursor() - 1);
 
 let mut backticks = 1;
 while self.s.eat_if('`') {
@@ -372,7 +364,7 @@ impl<'s> Tokens<'s> {
 }));
 }
 
-let start = self.s.index();
+let start = self.s.cursor();
 
 let mut found = 0;
 while found < backticks {
@@ -384,7 +376,7 @@ impl<'s> Tokens<'s> {
 }
 
 if found == backticks {
-let end = self.s.index() - found as usize;
+let end = self.s.cursor() - found as usize;
 NodeKind::Raw(Arc::new(resolve_raw(
 column,
 backticks,
@@ -412,7 +404,7 @@ impl<'s> Tokens<'s> {
 display = true;
 }
 
-let start = self.s.index();
+let start = self.s.cursor();
 
 let mut escaped = false;
 let mut dollar = !display;
@@ -429,7 +421,7 @@ impl<'s> Tokens<'s> {
 }
 };
 
-let end = self.s.index()
+let end = self.s.cursor()
 - match (terminated, display) {
 (false, _) => 0,
 (true, false) => 1,
@@ -456,7 +448,7 @@ impl<'s> Tokens<'s> {
 
 fn ident(&mut self, start: usize) -> NodeKind {
 self.s.eat_while(is_id_continue);
-match self.s.eaten_from(start) {
+match self.s.from(start) {
 "none" => NodeKind::None,
 "auto" => NodeKind::Auto,
 "true" => NodeKind::Bool(true),
@@ -467,30 +459,29 @@ impl<'s> Tokens<'s> {
 
 fn number(&mut self, start: usize, c: char) -> NodeKind {
 // Read the first part (integer or fractional depending on `first`).
-self.s.eat_while(|c| c.is_ascii_digit());
+self.s.eat_while(char::is_ascii_digit);
 
 // Read the fractional part if not already done.
 // Make sure not to confuse a range for the decimal separator.
-if c != '.' && !self.s.rest().starts_with("..") && self.s.eat_if('.') {
-self.s.eat_while(|c| c.is_ascii_digit());
+if c != '.' && !self.s.at("..") && self.s.eat_if('.') {
+self.s.eat_while(char::is_ascii_digit);
 }
 
 // Read the exponent.
-let em = self.s.rest().starts_with("em");
-if !em && self.s.eat_if('e') || self.s.eat_if('E') {
-let _ = self.s.eat_if('+') || self.s.eat_if('-');
-self.s.eat_while(|c| c.is_ascii_digit());
+if !self.s.at("em") && self.s.eat_if(['e', 'E']) {
+self.s.eat_if(['+', '-']);
+self.s.eat_while(char::is_ascii_digit);
 }
 
 // Read the suffix.
-let suffix_start = self.s.index();
+let suffix_start = self.s.cursor();
 if !self.s.eat_if('%') {
-self.s.eat_while(|c| c.is_ascii_alphanumeric());
+self.s.eat_while(char::is_ascii_alphanumeric);
 }
 
 let number = self.s.get(start .. suffix_start);
-let suffix = self.s.eaten_from(suffix_start);
-let all = self.s.eaten_from(start);
+let suffix = self.s.from(suffix_start);
+let all = self.s.from(start);
 
 // Find out whether it is a simple number.
 if suffix.is_empty() {
@@ -575,13 +566,13 @@ impl<'s> Tokens<'s> {
 
 fn in_word(&self) -> bool {
 let alphanumeric = |c: Option<char>| c.map_or(false, |c| c.is_alphanumeric());
-let prev = self.s.prev(1);
+let prev = self.s.scout(-2);
 let next = self.s.peek();
 alphanumeric(prev) && alphanumeric(next)
 }
 
 fn maybe_in_url(&self) -> bool {
-self.mode == TokenMode::Markup && self.s.eaten().ends_with(":/")
+self.mode == TokenMode::Markup && self.s.before().ends_with(":/")
 }
 }
 
@@ -610,7 +601,8 @@ fn keyword(ident: &str) -> Option<NodeKind> {
 })
 }
 
-/// The column index of a given index in the source string, given a column offset for the first line.
+/// The column index of a given index in the source string, given a column
+/// offset for the first line.
 #[inline]
 fn column(string: &str, index: usize, offset: usize) -> usize {
 let mut apply_offset = false;
@@ -634,6 +626,45 @@ fn column(string: &str, index: usize, offset: usize) -> usize {
 if apply_offset { res + offset } else { res }
 }
 
+/// Whether this character denotes a newline.
+#[inline]
+pub fn is_newline(character: char) -> bool {
+matches!(
+character,
+// Line Feed, Vertical Tab, Form Feed, Carriage Return.
+'\n' | '\x0B' | '\x0C' | '\r' |
+// Next Line, Line Separator, Paragraph Separator.
+'\u{0085}' | '\u{2028}' | '\u{2029}'
+)
+}
+
+/// Whether a string is a valid unicode identifier.
+///
+/// In addition to what is specified in the [Unicode Standard][uax31], we allow:
+/// - `_` as a starting character,
+/// - `_` and `-` as continuing characters.
+///
+/// [uax31]: http://www.unicode.org/reports/tr31/
+#[inline]
+pub fn is_ident(string: &str) -> bool {
+let mut chars = string.chars();
+chars
+.next()
+.map_or(false, |c| is_id_start(c) && chars.all(is_id_continue))
+}
+
+/// Whether a character can start an identifier.
+#[inline]
+pub fn is_id_start(c: char) -> bool {
+c.is_xid_start() || c == '_'
+}
+
+/// Whether a character can continue an identifier.
+#[inline]
+pub fn is_id_continue(c: char) -> bool {
+c.is_xid_continue() || c == '_' || c == '-'
+}
+
 #[cfg(test)]
 #[allow(non_snake_case)]
 mod tests {
@@ -6,9 +6,11 @@ use std::ops::Range;
 use std::path::{Path, PathBuf};
 use std::sync::Arc;
 
+use unscanny::Scanner;
+
 use crate::diag::TypResult;
 use crate::loading::{FileHash, Loader};
-use crate::parse::{is_newline, parse, Reparser, Scanner};
+use crate::parse::{is_newline, parse, Reparser};
 use crate::syntax::ast::Markup;
 use crate::syntax::{self, Category, GreenNode, RedNode};
 use crate::util::{PathExt, StrExt};
@@ -382,12 +384,12 @@ impl Line {
 let mut utf16_idx = utf16_offset;
 
 std::iter::from_fn(move || {
-s.eat_until(|c| {
+s.eat_until(|c: char| {
 utf16_idx += c.len_utf16();
 is_newline(c)
 });
 
-if s.eof() {
+if s.done() {
 return None;
 }
 
@@ -396,7 +398,7 @@ impl Line {
 }
 
 Some(Line {
-byte_idx: byte_offset + s.index(),
+byte_idx: byte_offset + s.cursor(),
 utf16_idx,
 })
 })
@@ -6,6 +6,7 @@ use std::path::Path;
 use std::sync::Arc;
 
 use tiny_skia as sk;
+use unscanny::Scanner;
 use walkdir::WalkDir;
 
 use typst::diag::Error;
@@ -15,7 +16,6 @@ use typst::geom::{Length, RgbaColor};
 use typst::library::layout::PageNode;
 use typst::library::text::{TextNode, TextSize};
 use typst::loading::FsLoader;
-use typst::parse::Scanner;
 use typst::source::SourceFile;
 use typst::syntax::Span;
 use typst::{bail, Context};
@@ -329,7 +329,7 @@ fn parse_metadata(source: &SourceFile) -> (Option<bool>, Vec<Error>) {
 };
 
 fn num(s: &mut Scanner) -> usize {
-s.eat_while(|c| c.is_numeric()).parse().unwrap()
+s.eat_while(char::is_numeric).parse().unwrap()
 }
 
 let comments =
@@ -348,7 +348,7 @@ fn parse_metadata(source: &SourceFile) -> (Option<bool>, Vec<Error>) {
 let end = if s.eat_if('-') { pos(&mut s) } else { start };
 let span = Span::new(source.id(), start, end);
 
-errors.push(Error::new(span, s.rest().trim()));
+errors.push(Error::new(span, s.after().trim()));
 }
 
 (compare_ref, errors)
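For reference, a minimal, self-contained sketch (not part of this commit) that exercises the unscanny Scanner calls the diff switches to. The method names are taken from the changed call sites above; the input string and assertions are made up for illustration.

    use unscanny::Scanner;

    fn main() {
        let mut s = Scanner::new("abc123 rest");
        let start = s.cursor();                              // was: s.index()
        let word = s.eat_while(char::is_ascii_alphanumeric); // fn item instead of closure
        assert_eq!(word, "abc123");
        assert_eq!(s.from(start), "abc123");                 // was: s.eaten_from(start)
        assert_eq!(s.before(), "abc123");                    // was: s.eaten()
        assert_eq!(s.after(), " rest");                      // was: s.rest()
        assert!(s.at(char::is_whitespace));                  // was: s.check_or(false, ...)
        assert_eq!(s.scout(-1), Some('3'));                  // was: s.prev(0)
        s.expect(' ');                                       // was: s.eat_assert(' ')
        assert!(!s.done());                                  // was: !s.eof()
    }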