Re-engineer tokenization 🚿

This commit is contained in:
Laurenz 2020-01-11 10:11:14 +01:00
parent a75ddd2c93
commit b1e956419d
11 changed files with 398 additions and 1449 deletions

View File

@ -15,7 +15,7 @@ pub mod prelude {
pub use crate::func::{Scope, ParseFunc, LayoutFunc, Command, Commands};
pub use crate::layout::prelude::*;
pub use crate::syntax::{
parse, ParseContext, ParseResult,
ParseContext, ParseResult,
SyntaxTree, FuncCall, FuncArgs, PosArg, KeyArg,
Expression, Ident, ExpressionKind,
Spanned, Span

View File

@ -297,9 +297,10 @@ function! {
parse!(forbidden: body);
if let Some(name) = args.get_pos_opt::<Ident>()? {
let flip = args.get_key_opt::<bool>("flip")?
.unwrap_or(false);
PageSizeFunc::Paper(Paper::from_name(name.as_str())?, flip)
let flip = args.get_key_opt::<bool>("flip")?.unwrap_or(false);
let paper = Paper::from_name(name.as_str())
.ok_or_else(|| error!(@"invalid paper name: `{}`", name))?;
PageSizeFunc::Paper(paper, flip)
} else {
PageSizeFunc::Custom(ExtentMap::new(&mut args, true)?)
}

View File

@ -72,7 +72,7 @@ impl Size {
impl Display for Size {
fn fmt(&self, f: &mut Formatter) -> fmt::Result {
write!(f, "{}cm", self.to_cm())
write!(f, "{}pt", self.points)
}
}

View File

@ -3,7 +3,6 @@
use toddle::query::{FontFallbackTree, FontVariant, FontStyle, FontWeight};
use crate::size::{Size, Size2D, SizeBox, ValueBox, PSize};
use crate::syntax::ParseResult;
/// Defines properties of pages and text.
@ -157,7 +156,7 @@ pub struct Paper {
impl Paper {
/// The paper with the given name.
pub fn from_name(name: &str) -> ParseResult<Paper> {
pub fn from_name(name: &str) -> Option<Paper> {
parse_paper(name)
}
}
@ -193,11 +192,11 @@ macro_rules! papers {
class: $class,
};)*
fn parse_paper(paper: &str) -> ParseResult<Paper> {
Ok(match paper.to_lowercase().as_str() {
$($($patterns)* => $var,)*
_ => error!("unknown paper size: `{}`", paper),
})
fn parse_paper(paper: &str) -> Option<Paper> {
match paper.to_lowercase().as_str() {
$($($patterns)* => Some($var),)*
_ => None,
}
}
};
}

View File

@ -11,48 +11,6 @@ pub_use_mod!(parsing);
pub_use_mod!(span);
/// A logical unit of the incoming text stream.
#[derive(Debug, Copy, Clone, Eq, PartialEq)]
pub enum Token<'s> {
    /// One or more whitespace (non-newline) codepoints.
    Space,
    /// A line feed (`\n`, `\r\n` and some more as defined by the Unicode standard).
    Newline,
    /// A left bracket: `[`.
    LeftBracket,
    /// A right bracket: `]`.
    RightBracket,
    /// A colon (`:`) indicating the beginning of function arguments (Function
    /// header only).
    ///
    /// If a colon occurs outside of a function header, it will be tokenized as
    /// [Text](Token::Text), just like the other tokens annotated with
    /// _Header only_.
    Colon,
    /// An equals (`=`) sign assigning a function argument a value (Header only).
    Equals,
    /// A comma (`,`) separating two function arguments (Header only).
    Comma,
    /// Quoted text as a string value (Header only).
    Quoted(&'s str),
    /// An underscore, indicating text in italics (Body only).
    Underscore,
    /// A star, indicating bold text (Body only).
    Star,
    /// A backtick, indicating monospace text (Body only).
    Backtick,
    /// A line comment; the contained string is everything after the `//`
    /// up to (not including) the line ending.
    LineComment(&'s str),
    /// A block comment; the contained string is everything between the
    /// opening `/*` and the closing `*/`.
    BlockComment(&'s str),
    /// A star followed by a slash unexpectedly ending a block comment
    /// (the comment was not started before, otherwise a
    /// [BlockComment](Token::BlockComment) would be returned).
    StarSlash,
    /// Any consecutive string which does not contain markup.
    Text(&'s str),
}
/// A tree representation of source code.
#[derive(Debug, PartialEq)]
pub struct SyntaxTree {
@ -256,11 +214,11 @@ debug_display!(Expression);
pub struct Ident(pub String);
impl Ident {
pub fn new(string: String) -> ParseResult<Ident> {
if is_identifier(&string) {
Ok(Ident(string))
pub fn new<S>(ident: S) -> Option<Ident> where S: AsRef<str> + Into<String> {
if is_identifier(ident.as_ref()) {
Some(Ident(ident.into()))
} else {
error!("invalid identifier: `{}`", string);
None
}
}
@ -277,20 +235,20 @@ impl Display for Ident {
debug_display!(Ident);
/// Whether this word is a valid unicode identifier.
/// Whether this word is a valid identifier.
fn is_identifier(string: &str) -> bool {
let mut chars = string.chars();
match chars.next() {
Some('-') => (),
Some(c) if UnicodeXID::is_xid_start(c) => (),
Some('-') => {}
Some(c) if UnicodeXID::is_xid_start(c) => {}
_ => return false,
}
while let Some(c) = chars.next() {
match c {
'.' | '-' => (),
c if UnicodeXID::is_xid_continue(c) => (),
'.' | '-' => {}
c if UnicodeXID::is_xid_continue(c) => {}
_ => return false,
}
}

View File

@ -1,7 +1,4 @@
//! Parsing of token streams into syntax trees.
use crate::func::Scope;
use crate::size::Size;
use super::*;
@ -10,7 +7,7 @@ pub type ParseResult<T> = crate::TypesetResult<T>;
/// Parses source code into a syntax tree given a context.
pub fn parse(src: &str, ctx: ParseContext) -> ParseResult<SyntaxTree> {
Parser::new(src, ctx).parse()
unimplemented!()
}
/// The context for parsing.
@ -19,833 +16,3 @@ pub struct ParseContext<'a> {
/// The scope containing function definitions.
pub scope: &'a Scope,
}
/// Transforms token streams into syntax trees.
#[derive(Debug)]
struct Parser<'s> {
    /// The original source code; used to slice out the raw text of
    /// function bodies.
    src: &'s str,
    /// The token stream, wrapped so one token of lookahead is available.
    tokens: PeekableTokens<'s>,
    /// The parsing context carrying the scope of known functions.
    ctx: ParseContext<'s>,
    /// The syntax tree that is built up while parsing.
    tree: SyntaxTree,
    /// Tokens collected for syntax highlighting.
    color_tokens: Vec<Spanned<ColorToken>>,
}
/// Tracks how many consecutive newlines have been seen while consuming
/// whitespace, deciding whether the run collapses into a space or emits a
/// `Node::Newline`.
#[derive(Debug, Copy, Clone, Eq, PartialEq)]
enum NewlineState {
    /// No newline yet.
    Zero,
    /// We saw one newline with the given span already and are
    /// looking for another.
    One(Span),
    /// We saw at least two newlines and wrote one, thus not
    /// writing another one for more newlines.
    TwoOrMore,
}
impl<'s> Parser<'s> {
/// Create a new parser from the source code and the context.
fn new(src: &'s str, ctx: ParseContext<'s>) -> Parser<'s> {
Parser {
src,
tokens: PeekableTokens::new(tokenize(src)),
ctx,
tree: SyntaxTree::new(),
color_tokens: vec![],
}
}
/// Parse the source into a syntax tree.
fn parse(mut self) -> ParseResult<SyntaxTree> {
while self.tokens.peek().is_some() {
self.parse_white()?;
self.parse_body_part()?;
}
Ok(self.tree)
}
/// Parse the next part of the body.
fn parse_body_part(&mut self) -> ParseResult<()> {
use Token::*;
if let Some(token) = self.tokens.peek() {
match token.v {
// Functions.
LeftBracket => self.parse_func()?,
RightBracket => error!("unexpected closing bracket"),
// Modifiers.
Underscore => self.add_consumed(Node::ToggleItalics, token.span),
Star => self.add_consumed(Node::ToggleBolder, token.span),
Backtick => self.add_consumed(Node::ToggleMonospace, token.span),
// Normal text.
Text(word) => self.add_consumed(Node::Text(word.to_owned()), token.span),
// The rest is handled elsewhere or should not happen, because
// the tokenizer does not yield these in a body.
Space | Newline | LineComment(_) | BlockComment(_) |
Colon | Equals | Comma | Quoted(_) | StarSlash
=> panic!("parse_body_part: unexpected token: {:?}", token),
}
}
Ok(())
}
/// Parse a complete function from the current position.
fn parse_func(&mut self) -> ParseResult<()> {
// This should only be called if a left bracket was seen.
let token = self.tokens.next().expect("parse_func: expected token");
assert!(token.v == Token::LeftBracket);
self.add_color_token(ColorToken::Bracket, token.span);
let mut span = token.span;
let name = self.parse_func_name()?;
// Check for arguments
let args = match self.tokens.next() {
Some(Spanned { v: Token::RightBracket, span }) => {
self.add_color_token(ColorToken::Bracket, span);
FuncArgs::new()
},
Some(Spanned { v: Token::Colon, span }) => {
self.add_color_token(ColorToken::Colon, span);
self.parse_func_args()?
}
_ => error!("expected arguments or closing bracket"),
};
span.end = self.tokens.get_position();
let (func, body_span) = self.parse_func_call(name, args)?;
if let Some(body_span) = body_span {
span.expand(body_span);
}
// Finally this function is parsed to the end.
self.add(Node::Func(func), span);
Ok(())
}
/// Parse a function header.
fn parse_func_name(&mut self) -> ParseResult<Spanned<Ident>> {
self.skip_white();
let name = match self.tokens.next() {
Some(Spanned { v: Token::Text(word), span }) => {
let ident = Ident::new(word.to_string())?;
Spanned::new(ident, span)
}
_ => error!("expected identifier"),
};
self.add_color_token(ColorToken::FuncName, name.span);
self.skip_white();
Ok(name)
}
/// Parse the arguments to a function.
fn parse_func_args(&mut self) -> ParseResult<FuncArgs> {
let mut args = FuncArgs::new();
loop {
self.skip_white();
match self.parse_func_arg()? {
Some(DynArg::Pos(arg)) => args.add_pos(arg),
Some(DynArg::Key(arg)) => args.add_key(arg),
None => {},
}
match self.tokens.next() {
Some(Spanned { v: Token::Comma, span }) => {
self.add_color_token(ColorToken::Comma, span);
}
Some(Spanned { v: Token::RightBracket, span }) => {
self.add_color_token(ColorToken::Bracket, span);
break;
}
_ => error!("expected comma or closing bracket"),
}
}
Ok(args)
}
/// Parse one argument to a function.
fn parse_func_arg(&mut self) -> ParseResult<Option<DynArg>> {
let token = match self.tokens.peek() {
Some(token) => token,
None => return Ok(None),
};
Ok(match token.v {
Token::Text(name) => {
self.advance();
self.skip_white();
Some(match self.tokens.peek() {
Some(Spanned { v: Token::Equals, span }) => {
self.advance();
self.skip_white();
let name = Ident::new(name.to_string())?;
let key = Spanned::new(name, token.span);
self.add_color_token(ColorToken::KeyArg, key.span);
self.add_color_token(ColorToken::Equals, span);
let next = self.tokens.next()
.ok_or_else(|| error!(@"expected expression"))?;
let value = Self::parse_expression(next)?;
self.add_expr_token(&value);
let span = Span::merge(key.span, value.span);
let arg = KeyArg { key, value };
DynArg::Key(Spanned::new(arg, span))
}
_ => {
let expr = Self::parse_expression(token)?;
self.add_expr_token(&expr);
DynArg::Pos(expr)
}
})
}
Token::Quoted(_) => {
self.advance();
self.skip_white();
self.add_color_token(ColorToken::ExprStr, token.span);
Some(DynArg::Pos(Self::parse_expression(token)?))
}
_ => None,
})
}
/// Parse a function call.
fn parse_func_call(&mut self, name: Spanned<Ident>, args: FuncArgs)
-> ParseResult<(FuncCall, Option<Span>)> {
// Now we want to parse this function dynamically.
let parser = self
.ctx
.scope
.get_parser(&name.v.0)
.ok_or_else(|| error!(@"unknown function: `{}`", &name.v))?;
let has_body = self.tokens.peek().map(Spanned::value) == Some(Token::LeftBracket);
// Do the parsing dependent on whether the function has a body.
Ok(if has_body {
self.advance();
// Find out the string which makes the body of this function.
let start_index = self.tokens.string_index();
let mut start_pos = self.tokens.get_position();
start_pos.column -= 1;
let (mut end_index, mut end_pos) =
find_closing_bracket(&self.src[start_index..])
.ok_or_else(|| error!(@"expected closing bracket"))?;
end_index += start_index;
end_pos.column += 1;
let span = Span::new(start_pos, end_pos);
// Parse the body.
let body_string = &self.src[start_index..end_index];
let body = parser(args, Some(body_string), self.ctx)?;
// Skip to the end of the function in the token stream.
self.tokens.set_string_index(end_index);
// Now the body should be closed.
let token = self.tokens.next().expect("parse_func_body: expected token");
assert!(token.v == Token::RightBracket);
(FuncCall(body), Some(span))
} else {
(FuncCall(parser(args, None, self.ctx)?), None)
})
}
/// Parse an expression.
fn parse_expression(token: Spanned<Token>) -> ParseResult<Spanned<Expression>> {
Ok(Spanned::new(match token.v {
Token::Quoted(text) => Expression::Str(text.to_owned()),
Token::Text(text) => {
if let Ok(b) = text.parse::<bool>() {
Expression::Bool(b)
} else if let Ok(num) = text.parse::<f64>() {
Expression::Num(num)
} else if let Ok(size) = text.parse::<Size>() {
Expression::Size(size)
} else {
// This loop does not actually loop, but is used for breaking.
loop {
if text.ends_with('%') {
if let Ok(percent) = text[.. text.len()-1].parse::<f64>() {
break Expression::Num(percent / 100.0);
}
}
break Expression::Ident(Ident::new(text.to_string())?);
}
}
}
_ => error!("expected expression"),
}, token.span))
}
/// Parse whitespace (as long as there is any) and skip over comments.
fn parse_white(&mut self) -> ParseResult<()> {
let mut state = NewlineState::Zero;
while let Some(token) = self.tokens.peek() {
match token.v {
Token::Space => {
self.advance();
match state {
NewlineState::Zero | NewlineState::TwoOrMore => {
self.add_space(token.span);
}
_ => {}
}
}
Token::Newline => {
self.advance();
match state {
NewlineState::Zero => state = NewlineState::One(token.span),
NewlineState::One(span) => {
self.add(Node::Newline, Span::merge(span, token.span));
state = NewlineState::TwoOrMore;
},
NewlineState::TwoOrMore => self.add_space(token.span),
}
}
_ => {
if let NewlineState::One(span) = state {
self.add_space(Span::new(span.start, token.span.start));
}
state = NewlineState::Zero;
match token.v {
Token::LineComment(_) | Token::BlockComment(_) => self.advance(),
Token::StarSlash => error!("unexpected end of block comment"),
_ => break,
}
}
}
}
Ok(())
}
/// Skip over whitespace and comments.
fn skip_white(&mut self) {
while let Some(token) = self.tokens.peek() {
match token.v {
Token::Space | Token::Newline |
Token::LineComment(_) | Token::BlockComment(_) => self.advance(),
_ => break,
}
}
}
/// Advance the iterator by one step.
fn advance(&mut self) {
self.tokens.next();
}
/// Append a node to the tree.
fn add(&mut self, node: Node, span: Span) {
self.tree.nodes.push(Spanned::new(node, span));
}
/// Append a space, merging with a previous space if there is one.
fn add_space(&mut self, span: Span) {
match self.tree.nodes.last_mut() {
Some(ref mut node) if node.v == Node::Space => node.span.expand(span),
_ => self.add(Node::Space, span),
}
}
/// Advance and return the given node.
fn add_consumed(&mut self, node: Node, span: Span) {
self.advance();
self.add(node, span);
}
/// Add a color token to the list.
fn add_color_token(&mut self, token: ColorToken, span: Span) {
self.color_tokens.push(Spanned::new(token, span));
}
/// Add a color token for an expression.
fn add_expr_token(&mut self, expr: &Spanned<Expression>) {
let kind = match expr.v {
Expression::Bool(_) => ColorToken::ExprBool,
Expression::Ident(_) => ColorToken::ExprIdent,
Expression::Num(_) => ColorToken::ExprNumber,
Expression::Size(_) => ColorToken::ExprSize,
Expression::Str(_) => ColorToken::ExprStr,
};
self.add_color_token(kind, expr.span);
}
}
/// Find the index of the first unbalanced and unescaped closing bracket.
///
/// Returns the byte index of the bracket together with its `Position`
/// (line and column) inside `src`, or `None` if every `]` is balanced
/// by a preceding `[` or escaped with a backslash.
fn find_closing_bracket(src: &str) -> Option<(usize, Position)> {
    let mut depth = 0;
    let mut backslashed = false;
    let mut cur_line = 1;
    let mut line_start = 0;

    for (idx, ch) in src.char_indices() {
        if ch == '\\' {
            // Two backslashes in a row cancel each other out.
            backslashed = !backslashed;
            continue;
        }

        if is_newline_char(ch) {
            cur_line += 1;
            line_start = idx + ch.len_utf8();
        } else if !backslashed {
            match ch {
                // The first unbalanced closing bracket ends the search.
                ']' if depth == 0 => {
                    let pos = Position {
                        line: cur_line,
                        column: idx - line_start,
                    };
                    return Some((idx, pos));
                }
                '[' => depth += 1,
                ']' => depth -= 1,
                _ => {}
            }
        }

        // Any non-backslash character consumes a pending escape.
        backslashed = false;
    }

    None
}
/// A peekable iterator for tokens which allows access to the original iterator
/// inside this module (which is needed by the parser).
#[derive(Debug, Clone)]
struct PeekableTokens<'s> {
    /// The underlying token stream.
    tokens: Tokens<'s>,
    /// The buffered lookahead: the outer `Option` is `Some` once a peek has
    /// happened, the inner `Option` records whether the stream was already
    /// exhausted at that point.
    peeked: Option<Option<Spanned<Token<'s>>>>,
}
impl<'s> PeekableTokens<'s> {
    /// Create a new iterator from a string.
    fn new(tokens: Tokens<'s>) -> PeekableTokens<'s> {
        PeekableTokens {
            tokens,
            peeked: None,
        }
    }

    /// Peek at the next element without consuming it.
    fn peek(&mut self) -> Option<Spanned<Token<'s>>> {
        let iter = &mut self.tokens;
        *self.peeked.get_or_insert_with(|| iter.next())
    }

    /// The position of the next token that will be returned.
    fn get_position(&self) -> Position {
        match self.peeked {
            Some(Some(peeked)) => peeked.span.start,
            _ => self.tokens.get_position(),
        }
    }

    /// The string index of the next token that will be returned.
    fn string_index(&self) -> usize {
        match self.peeked {
            // NOTE(review): this returns the peeked token's *line number*
            // (`span.start.line`) where a byte index into the source string
            // appears to be expected — presumably a bug; confirm against the
            // definitions of `Span`/`Position` and the callers in the parser.
            Some(Some(peeked)) => peeked.span.start.line,
            _ => self.tokens.string_index(),
        }
    }

    /// Jump to a new index in the underlying string, discarding any
    /// token that was already peeked.
    fn set_string_index(&mut self, index: usize) {
        self.tokens.set_string_index(index);
        self.peeked = None;
    }
}
impl<'s> Iterator for PeekableTokens<'s> {
    type Item = Spanned<Token<'s>>;

    /// Yield the buffered peeked token if there is one, otherwise pull the
    /// next token directly from the underlying stream.
    fn next(&mut self) -> Option<Self::Item> {
        let buffered = self.peeked.take();
        buffered.unwrap_or_else(|| self.tokens.next())
    }
}
#[cfg(test)]
#[allow(non_snake_case)]
mod tests {
use crate::func::{Commands, Scope};
use crate::layout::{LayoutContext, LayoutResult};
use crate::syntax::*;
use Node::{Func as F, Newline as N, Space as S};
function! {
/// A testing function which just parses it's body into a syntax
/// tree.
#[derive(Debug)]
pub struct TreeFn { pub tree: SyntaxTree }
parse(args, body, ctx) {
args.clear();
TreeFn {
tree: parse!(expected: body, ctx)
}
}
layout() { vec![] }
}
impl PartialEq for TreeFn {
fn eq(&self, other: &TreeFn) -> bool {
assert_tree_equal(&self.tree, &other.tree);
true
}
}
function! {
/// A testing function without a body.
#[derive(Debug, Default, PartialEq)]
pub struct BodylessFn(Vec<Expression>, Vec<(Ident, Expression)>);
parse(args, body) {
parse!(forbidden: body);
BodylessFn(
args.pos().map(Spanned::value).collect(),
args.keys().map(|arg| (arg.v.key.v, arg.v.value.v)).collect(),
)
}
layout() { vec![] }
}
mod args {
use super::*;
use super::Expression;
pub use Expression::{Num as N, Size as Z, Bool as B};
pub fn S(string: &str) -> Expression { Expression::Str(string.to_owned()) }
pub fn I(string: &str) -> Expression {
Expression::Ident(Ident::new(string.to_owned()).unwrap())
}
}
/// Asserts that two syntax trees are equal except for all spans inside them.
fn assert_tree_equal(a: &SyntaxTree, b: &SyntaxTree) {
for (x, y) in a.nodes.iter().zip(&b.nodes) {
if x.v != y.v {
panic!("trees are not equal: ({:#?}) != ({:#?})", x.v, y.v);
}
}
}
/// Test if the source code parses into the syntax tree.
fn test(src: &str, tree: SyntaxTree) {
let ctx = ParseContext {
scope: &Scope::new(),
};
assert_tree_equal(&parse(src, ctx).unwrap(), &tree);
}
/// Test with a scope containing function definitions.
fn test_scoped(scope: &Scope, src: &str, tree: SyntaxTree) {
let ctx = ParseContext { scope };
assert_tree_equal(&parse(src, ctx).unwrap(), &tree);
}
/// Test if the source parses into the error.
fn test_err(src: &str, err: &str) {
let ctx = ParseContext {
scope: &Scope::new(),
};
assert_eq!(parse(src, ctx).unwrap_err().to_string(), err);
}
/// Test with a scope if the source parses into the error.
fn test_err_scoped(scope: &Scope, src: &str, err: &str) {
let ctx = ParseContext { scope };
assert_eq!(parse(src, ctx).unwrap_err().to_string(), err);
}
fn test_color(scope: &Scope, src: &str, tokens: Vec<(usize, usize, ColorToken)>) {
let ctx = ParseContext { scope };
let tree = parse(src, ctx).unwrap();
// assert_eq!(tree.tokens,
// tokens.into_iter()
// .map(|(s, e, t)| Spanned::new(t, Span::new(s, e)))
// .collect::<Vec<_>>()
// );
}
/// Create a text node.
fn T(s: &str) -> Node {
Node::Text(s.to_owned())
}
fn zerospan<T>(val: T) -> Spanned<T> {
Spanned::new(val, Span::new(Position::new(0, 0), Position::new(0, 0)))
}
/// Shortcut macro to create a syntax tree. Is `vec`-like and the elements
/// are the nodes without spans.
macro_rules! tree {
($($x:expr),*) => ({
#[allow(unused_mut)] let mut nodes = vec![];
$(
nodes.push(zerospan($x));
)*
SyntaxTree { nodes }
});
($($x:expr,)*) => (tree![$($x),*])
}
/// Shortcut macro to create a function.
macro_rules! func {
() => (
FuncCall(Box::new(BodylessFn(vec![], vec![])))
);
(body: $tree:expr $(,)*) => (
FuncCall(Box::new(TreeFn { tree: $tree }))
);
(args: $pos:expr, $key:expr) => (
FuncCall(Box::new(BodylessFn($pos, $key)))
);
}
/// Parse the basic cases.
#[test]
#[rustfmt::skip]
fn parse_base() {
test("", tree! []);
test("Hello World!", tree! [ T("Hello"), S, T("World!") ]);
}
/// Test whether newlines generate the correct whitespace.
#[test]
#[rustfmt::skip]
fn parse_newlines_whitespace() {
test("Hello\nWorld", tree! [ T("Hello"), S, T("World") ]);
test("Hello \n World", tree! [ T("Hello"), S, T("World") ]);
test("Hello\n\nWorld", tree! [ T("Hello"), N, T("World") ]);
test("Hello \n\nWorld", tree! [ T("Hello"), S, N, T("World") ]);
test("Hello\n\n World", tree! [ T("Hello"), N, S, T("World") ]);
test("Hello \n \n \n World", tree! [ T("Hello"), S, N, S, T("World") ]);
test("Hello\n \n\n World", tree! [ T("Hello"), N, S, T("World") ]);
test("Hello\n \nWorld", tree! [ T("Hello"), N, T("World") ]);
}
/// Parse things dealing with functions.
#[test]
#[rustfmt::skip]
fn parse_functions() {
let mut scope = Scope::new();
scope.add::<BodylessFn>("test");
scope.add::<BodylessFn>("end");
scope.add::<TreeFn>("modifier");
scope.add::<TreeFn>("func");
test_scoped(&scope,"[test]", tree! [ F(func! {}) ]);
test_scoped(&scope,"[ test]", tree! [ F(func! {}) ]);
test_scoped(&scope, "This is an [modifier][example] of a function invocation.", tree! [
T("This"), S, T("is"), S, T("an"), S,
F(func! { body: tree! [ T("example") ] }), S,
T("of"), S, T("a"), S, T("function"), S, T("invocation.")
]);
test_scoped(&scope, "[func][Hello][modifier][Here][end]", tree! [
F(func! { body: tree! [ T("Hello") ] }),
F(func! { body: tree! [ T("Here") ] }),
F(func! {}),
]);
test_scoped(&scope, "[func][]", tree! [ F(func! { body: tree! [] }) ]);
test_scoped(&scope, "[modifier][[func][call]] outside", tree! [
F(func! { body: tree! [ F(func! { body: tree! [ T("call") ] }) ] }), S, T("outside")
]);
}
/// Parse functions with arguments.
#[test]
#[rustfmt::skip]
fn parse_function_args() {
use args::*;
fn func(
pos: Vec<Expression>,
key: Vec<(&str, Expression)>,
) -> SyntaxTree {
let key = key.into_iter()
.map(|s| (Ident::new(s.0.to_string()).unwrap(), s.1))
.collect();
tree! [ F(func!(args: pos, key)) ]
}
let mut scope = Scope::new();
scope.add::<BodylessFn>("align");
test_scoped(&scope, "[align: left]", func(vec![I("left")], vec![]));
test_scoped(&scope, "[align: left,right]", func(vec![I("left"), I("right")], vec![]));
test_scoped(&scope, "[align: left, right]", func(vec![I("left"), I("right")], vec![]));
test_scoped(&scope, "[align: \"hello\"]", func(vec![S("hello")], vec![]));
test_scoped(&scope, r#"[align: "hello\"world"]"#, func(vec![S(r#"hello\"world"#)], vec![]));
test_scoped(&scope, "[align: 12]", func(vec![N(12.0)], vec![]));
test_scoped(&scope, "[align: 17.53pt]", func(vec![Z(Size::pt(17.53))], vec![]));
test_scoped(&scope, "[align: 2.4in]", func(vec![Z(Size::inches(2.4))], vec![]));
test_scoped(&scope, "[align: true, 10mm, left, \"hi, there\"]",
func(vec![B(true), Z(Size::mm(10.0)), I("left"), S("hi, there")], vec![]));
test_scoped(&scope, "[align: right=true]", func(vec![], vec![("right", B(true))]));
test_scoped(&scope, "[align: flow = horizontal]",
func(vec![], vec![("flow", I("horizontal"))]));
test_scoped(&scope, "[align: x=1cm, y=20mm]",
func(vec![], vec![("x", Z(Size::cm(1.0))), ("y", Z(Size::mm(20.0)))]));
test_scoped(&scope, "[align: x=5.14,a, \"b\", c=me,d=you]",
func(vec![I("a"), S("b")], vec![("x", N(5.14)), ("c", I("me")), ("d", I("you"))]));
}
/// Parse comments (line and block).
#[test]
#[rustfmt::skip]
fn parse_comments() {
let mut scope = Scope::new();
scope.add::<BodylessFn>("test");
scope.add::<TreeFn>("func");
test_scoped(&scope, "Text\n// Comment\n More text",
tree! [ T("Text"), S, T("More"), S, T("text") ]);
test_scoped(&scope, "[test/*world*/]",
tree! [ F(func! {}) ]);
test_scoped(&scope, "[test/*]*/]",
tree! [ F(func! {}) ]);
}
/// Test if escaped, but unbalanced parens are correctly parsed.
#[test]
#[rustfmt::skip]
fn parse_unbalanced_body_parens() {
let mut scope = Scope::new();
scope.add::<TreeFn>("code");
test_scoped(&scope, r"My [code][Close \]] end", tree! [
T("My"), S, F(func! { body: tree! [ T("Close"), S, T("]") ] }), S, T("end")
]);
test_scoped(&scope, r"My [code][\[ Open] end", tree! [
T("My"), S, F(func! { body: tree! [ T("["), S, T("Open") ] }), S, T("end")
]);
test_scoped(&scope, r"My [code][Open \] and \[ close]end", tree! [
T("My"), S, F(func! { body:
tree! [ T("Open"), S, T("]"), S, T("and"), S, T("["), S, T("close") ]
}), T("end")
]);
}
/// Tests if the parser handles non-ASCII stuff correctly.
#[test]
#[rustfmt::skip]
fn parse_unicode() {
let mut scope = Scope::new();
scope.add::<BodylessFn>("func");
scope.add::<TreeFn>("bold");
test_scoped(&scope, "[func] ⺐.", tree! [ F(func! {}), S, T("⺐.") ]);
test_scoped(&scope, "[bold][Hello 🌍!]", tree! [
F(func! { body: tree! [ T("Hello"), S, T("🌍!") ] })
]);
}
/// Tests whether spans get calculated correctly.
#[test]
#[rustfmt::skip]
fn parse_spans() {
fn test_span(src: &str, correct: Vec<(usize, usize, usize, usize)>) {
let mut scope = Scope::new();
scope.add::<TreeFn>("hello");
let tree = parse(src, ParseContext { scope: &scope }).unwrap();
let spans = tree.nodes.into_iter()
.map(|node| {
let Span { start, end } = node.span;
(start.line, start.column, end.line, end.column)
})
.collect::<Vec<_>>();
assert_eq!(spans, correct);
}
test_span("hello world", vec![(1, 0, 1, 5), (1, 5, 1, 6), (1, 6, 1, 11)]);
test_span("p1\n \np2", vec![(1, 0, 1, 2), (1, 2, 2, 2), (3, 0, 3, 2)]);
let src = "func\n [hello: pos, other][body\r\n _🌍_\n]";
test_span(src, vec![
(1, 0, 1, 4),
(1, 4, 2, 1),
(2, 1, 4, 1)
]);
}
/// Tests whether errors get reported correctly.
#[test]
#[rustfmt::skip]
fn parse_errors() {
let mut scope = Scope::new();
scope.add::<TreeFn>("hello");
test_err("No functions here]", "unexpected closing bracket");
test_err_scoped(&scope, "[hello][world", "expected closing bracket");
test_err("[hello world", "expected arguments or closing bracket");
test_err("[ no^name][Why?]", "invalid identifier: `no^name`");
test_err("Hello */", "unexpected end of block comment");
}
/// Tests syntax highlighting.
#[test]
#[rustfmt::skip]
fn test_highlighting() {
use ColorToken::{Bracket as B, FuncName as F, *};
let mut scope = Scope::new();
scope.add::<BodylessFn>("func");
scope.add::<TreeFn>("tree");
test_color(&scope, "[func]", vec![(0, 1, B), (1, 5, F), (5, 6, B)]);
test_color(&scope, "[func: 12pt]", vec![
(0, 1, B), (1, 5, F), (5, 6, Colon), (7, 11, ExprSize), (11, 12, B)
]);
test_color(&scope, "[func: x=25.3, y=\"hi\"]", vec![
(0, 1, B), (1, 5, F), (5, 6, Colon),
(7, 8, KeyArg), (8, 9, Equals), (9, 13, ExprNumber),
(13, 14, Comma),
(15, 16, KeyArg), (16, 17, Equals), (17, 21, ExprStr),
(21, 22, B),
]);
test_color(&scope, "Hello [tree][With [func: 3]]", vec![
(6, 7, B), (7, 11, F), (11, 12, B),
(12, 13, B), (18, 19, B)
]);
}
}

View File

@ -45,8 +45,6 @@ impl Span {
}
pub fn merge(a: Span, b: Span) -> Span {
let start = a.start.min(b.start);
Span {
start: a.start.min(b.start),
end: a.end.max(b.end),

View File

@ -1,88 +1,87 @@
//! Tokenization of source code.
use std::str::CharIndices;
use smallvec::SmallVec;
use std::iter::Peekable;
use std::str::Chars;
use super::*;
use Token::*;
use State::*;
/// Builds an iterator over the tokens of the source code.
pub fn tokenize(src: &str) -> Tokens {
Tokens::new(src)
}
/// An iterator over the tokens of source code.
#[derive(Debug, Clone)]
pub struct Tokens<'s> {
src: &'s str,
chars: PeekableChars<'s>,
state: TokensState,
stack: SmallVec<[TokensState; 1]>,
line: usize,
line_start_index: usize,
/// A minimal semantic entity of source code.
#[derive(Debug, Clone, PartialEq)]
pub enum Token<'s> {
    /// One or more whitespace characters. The contained `usize` denotes the
    /// number of newlines that were contained in the whitespace.
    Whitespace(usize),
    /// A line comment with inner string contents `//<&'s str>\n`.
    LineComment(&'s str),
    /// A block comment with inner string contents `/*<&'s str>*/`. The comment
    /// can contain nested block comments.
    BlockComment(&'s str),
    /// An erroneous `*/` without an opening block comment.
    StarSlash,
    /// A left bracket: `[`.
    LeftBracket,
    /// A right bracket: `]`.
    RightBracket,
    /// A left parenthesis in a function header: `(`.
    LeftParen,
    /// A right parenthesis in a function header: `)`.
    RightParen,
    /// A left brace in a function header: `{`.
    LeftBrace,
    /// A right brace in a function header: `}`.
    RightBrace,
    /// A colon in a function header: `:`.
    Colon,
    /// A comma in a function header: `,`.
    Comma,
    /// An equals sign in a function header: `=`.
    Equals,
    /// An expression in a function header.
    Expr(Expression),
    /// A star in body-text.
    Star,
    /// An underscore in body-text.
    Underscore,
    /// A backtick in body-text.
    Backtick,
    /// Any other consecutive string.
    Text(&'s str),
}
/// An iterator over the tokens of a string of source code.
pub struct Tokens<'s> {
    /// The source string the emitted tokens borrow from.
    src: &'s str,
    /// The character-level iterator over the source.
    chars: Characters<'s>,
    /// The state the tokenizer is currently in.
    state: State,
    /// Previously entered states to return to when the current one is popped.
    stack: Vec<State>,
}
/// The state the tokenizer is in.
#[derive(Debug, Copy, Clone, Eq, PartialEq)]
enum TokensState {
/// The base state if there is nothing special we are in.
enum State {
Header,
StartBody,
Body,
/// Inside a function header. Here colons and equal signs get parsed
/// as distinct tokens rather than text.
Function,
/// We expect either the end of the function or the beginning of the body.
MaybeBody,
}
impl<'s> Tokens<'s> {
/// Create a new token stream from source code.
pub fn new(src: &'s str) -> Tokens<'s> {
Tokens {
src,
chars: PeekableChars::new(src),
state: TokensState::Body,
stack: SmallVec::new(),
line: 1,
line_start_index: 0,
}
}
/// The index of the first character of the next token in the source string.
pub fn string_index(&self) -> usize {
self.chars.string_index()
}
/// Go to a new position in the underlying string.
pub fn set_string_index(&mut self, index: usize) {
self.chars.set_string_index(index);
}
/// The current position in the source.
pub fn get_position(&self) -> Position {
self.line_position(self.string_index())
}
/// Advance the iterator by one step.
fn advance(&mut self) {
self.chars.next();
}
/// Switch to the given state.
fn switch(&mut self, state: TokensState) {
self.stack.push(self.state);
self.state = state;
}
/// Go back to the top-of-stack state.
fn unswitch(&mut self) {
self.state = self.stack.pop().unwrap_or(TokensState::Body);
}
/// The `Position` with line and column for a string index.
fn line_position(&self, index: usize) -> Position {
Position {
line: self.line,
column: index - self.line_start_index,
chars: Characters::new(src),
state: State::Body,
stack: vec![],
}
}
}
@ -90,455 +89,281 @@ impl<'s> Tokens<'s> {
impl<'s> Iterator for Tokens<'s> {
type Item = Spanned<Token<'s>>;
/// Advance the iterator, return the next token or nothing.
fn next(&mut self) -> Option<Self::Item> {
use TokensState as TS;
/// Parse the next token in the source code.
fn next(&mut self) -> Option<Spanned<Token<'s>>> {
let start = self.chars.position();
let first = self.chars.next()?;
let second = self.chars.peek();
// Go to the body state if the function has a body or return to the top-of-stack
// state.
if self.state == TS::MaybeBody {
if let Some((index, '[')) = self.chars.peek() {
self.advance();
self.state = TS::Body;
let span = Span::at(self.line_position(index));
return Some(Spanned::new(Token::LeftBracket, span));
} else {
self.unswitch();
}
}
let token = match first {
// Comments.
'/' if second == Some('/') => self.parse_line_comment(),
'/' if second == Some('*') => self.parse_block_comment(),
'*' if second == Some('/') => { self.eat(); StarSlash }
// Take the next char and peek at the one behind.
let (pos, next) = self.chars.next()?;
let afterwards = self.chars.peekc();
// Whitespace.
c if c.is_whitespace() => self.parse_whitespace(c),
/// The index at which the line ended, if it did.
let mut eol = None;
let token = match next {
// Functions
'[' => {
self.switch(TS::Function);
Token::LeftBracket
}
// Functions.
'[' => { self.set_state(Header); LeftBracket }
']' => {
if self.state == TS::Function {
self.state = TS::MaybeBody;
if self.state == Header && second == Some('[') {
self.state = StartBody;
} else {
self.unswitch();
self.pop_state();
}
Token::RightBracket
RightBracket
}
// Line comment
'/' if afterwards == Some('/') => {
let start = self.string_index() + 1;
// Syntactic elements in function headers.
'(' if self.state == Header => LeftParen,
')' if self.state == Header => RightParen,
'{' if self.state == Header => LeftBrace,
'}' if self.state == Header => RightBrace,
':' if self.state == Header => Colon,
',' if self.state == Header => Comma,
'=' if self.state == Header => Equals,
while let Some(c) = self.chars.peekc() {
if is_newline_char(c) {
break;
}
self.advance();
}
// String values.
'"' if self.state == Header => self.parse_string(),
let end = self.string_index();
Token::LineComment(&self.src[start..end])
}
// Style toggles.
'*' if self.state == Body => Star,
'_' if self.state == Body => Underscore,
'`' if self.state == Body => Backtick,
// Block comment
'/' if afterwards == Some('*') => {
let start = self.string_index() + 1;
let mut nested = 0;
// An escaped thing.
'\\' => self.parse_escaped(),
while let Some((_, c)) = self.chars.next() {
let after = self.chars.peekc();
match (c, after) {
('*', Some('/')) if nested == 0 => {
self.advance();
break;
}
('/', Some('*')) => {
self.advance();
nested += 1
}
('*', Some('/')) => {
self.advance();
nested -= 1
}
_ => {}
}
}
let end = self.string_index() - 2;
Token::BlockComment(&self.src[start..end])
}
// Unexpected end of block comment
'*' if afterwards == Some('/') => {
self.advance();
Token::StarSlash
}
// Whitespace
' ' | '\t' => {
while let Some(c) = self.chars.peekc() {
match c {
' ' | '\t' => self.advance(),
_ => break,
}
}
Token::Space
}
// Newlines
'\r' if afterwards == Some('\n') => {
self.advance();
eol = Some(pos + "\r\n".len());
Token::Newline
}
c if is_newline_char(c) => {
eol = Some(pos + c.len_utf8());
Token::Newline
}
// Star/Underscore/Backtick in bodies
'*' if self.state == TS::Body => Token::Star,
'_' if self.state == TS::Body => Token::Underscore,
'`' if self.state == TS::Body => Token::Backtick,
// Context sensitive operators in headers
':' if self.state == TS::Function => Token::Colon,
'=' if self.state == TS::Function => Token::Equals,
',' if self.state == TS::Function => Token::Comma,
// A string value.
'"' if self.state == TS::Function => {
let start = self.string_index();
let mut end = start;
let mut escaped = false;
while let Some((index, c)) = self.chars.next() {
end = index;
if c == '"' && !escaped {
break;
}
escaped = c == '\\';
}
Token::Quoted(&self.src[start..end])
}
// Escaping
'\\' => {
if let Some((index, c)) = self.chars.peek() {
let escapable = match c {
'[' | ']' | '\\' | '*' | '_' | '`' | ':' | '=' | ',' | '/' => true,
// Expressions or just strings.
c => {
let word = self.read_string_until(|n| {
match n {
c if c.is_whitespace() => true,
'\\' | '[' | ']' | '*' | '_' | '`' | ':' | '=' |
',' | '"' | '/' => true,
_ => false,
};
if escapable {
self.advance();
Token::Text(&self.src[index..index + c.len_utf8()])
} else {
Token::Text("\\")
}
}, false, -(c.len_utf8() as isize), 0);
if self.state == Header {
self.parse_expr(word)
} else {
Token::Text("\\")
Text(word)
}
}
// Normal text
_ => {
// Find out when the word ends.
while let Some((_, c)) = self.chars.peek() {
let second = self.chars.peekn(1).map(|p| p.1);
// Whether the next token is still from the text or not.
let continues = match c {
'[' | ']' | '\\' => false,
'*' | '_' | '`' if self.state == TS::Body => false,
':' | '=' | ',' | '"' if self.state == TS::Function => false,
'/' => second != Some('/') && second != Some('*'),
'*' => second != Some('/'),
' ' | '\t' => false,
c if is_newline_char(c) => false,
_ => true,
};
if !continues {
break;
}
self.advance();
}
let end = self.string_index();
Token::Text(&self.src[pos..end])
}
};
let start = self.line_position(pos);
let end = self.get_position();
let span = Span::new(start, end);
let end = self.chars.position();
let span = Span { start, end };
if let Some(index) = eol {
self.line += 1;
self.line_start_index = index;
}
Some(Spanned::new(token, span))
Some(Spanned { v: token, span })
}
}
/// Whether this character is a newline (or starts one).
pub(crate) fn is_newline_char(character: char) -> bool {
impl<'s> Tokens<'s> {
    /// Tokenize a line comment. The leading `/` was already consumed and the
    /// second `/` is still pending; `offset_start = 1` skips it so the token
    /// carries only the text after `//`, up to (not including) the newline.
    fn parse_line_comment(&mut self) -> Token<'s> {
        LineComment(self.read_string_until(is_newline_char, false, 1, 0))
    }

    /// Tokenize a (possibly nested) block comment. The leading `/` was already
    /// consumed; `eat` removes the pending `*`. `depth` tracks nesting so
    /// inner `/* ... */` pairs do not terminate the comment early.
    fn parse_block_comment(&mut self) -> Token<'s> {
        // Tracks the previously seen character class so `*/` and `/*`
        // two-character sequences can be recognized one char at a time.
        enum Last { Slash, Star, Other }
        use Last::*;
        self.eat();
        let mut depth = 0;
        let mut last = Last::Other;
        // Find the first `*/` that does not correspond to a nested `/*`.
        // Remove the last two bytes to obtain the raw inner text without `*/`.
        BlockComment(self.read_string_until(|n| {
            match n {
                '/' => match last {
                    Star if depth == 0 => return true,
                    Star => depth -= 1,
                    _ => last = Slash
                }
                '*' => match last {
                    Slash => depth += 1,
                    _ => last = Star,
                }
                _ => last = Other,
            }
            false
        }, true, 0, -2))
    }

    /// Consume a run of whitespace starting with `c` and return a
    /// `Whitespace` token carrying the number of newlines in the run.
    /// A `\r\n` pair is counted as a single newline.
    fn parse_whitespace(&mut self, c: char) -> Token<'s> {
        let mut newlines = if is_newline_char(c) { 1 } else { 0 };
        let mut last = c;
        // Stop at the first non-whitespace char; it is left in the stream
        // because `eat_match` is false.
        self.read_string_until(|n| {
            if is_newline_char(n) && !(last == '\r' && n == '\n') {
                newlines += 1;
            }
            last = n;
            !n.is_whitespace()
        }, false, 0, 0);
        Whitespace(newlines)
    }

    /// Tokenize a quoted string expression. The opening `"` was already
    /// consumed; reads until the first unescaped `"`, eats it, and trims it
    /// from the slice via `offset_end = -1`. Escape sequences are kept raw.
    fn parse_string(&mut self) -> Token<'s> {
        let mut escaped = false;
        Expr(Expression::Str(self.read_string_until(|n| {
            if n == '"' && !escaped {
                return true;
            } else if n == '\\' {
                escaped = !escaped;
            } else {
                escaped = false;
            }
            false
        }, true, 0, -1).to_string()))
    }

    /// Handle a backslash: in body state, an escapable symbol after it is
    /// consumed and emitted as plain text; otherwise the backslash itself
    /// becomes text and the following char is left for the next token.
    fn parse_escaped(&mut self) -> Token<'s> {
        fn is_escapable(c: char) -> bool {
            match c {
                '\\' | '[' | ']' | '*' | '_' | '`' | '/' => true,
                _ => false,
            }
        }
        // `'n'` is a placeholder that is not escapable, so a trailing
        // backslash at the end of input falls into the `Text("\\")` branch.
        let c = self.chars.peek().unwrap_or('n');
        if self.state == Body && is_escapable(c) {
            let index = self.chars.index();
            self.eat();
            Text(&self.src[index .. index + c.len_utf8()])
        } else {
            Text("\\")
        }
    }

    /// Classify a header word as an expression: tried in order as bool,
    /// number, percentage (scaled into a fraction), size, then identifier;
    /// anything else is emitted as plain text.
    fn parse_expr(&mut self, word: &'s str) -> Token<'s> {
        if let Ok(b) = word.parse::<bool>() {
            Expr(Expression::Bool(b))
        } else if let Ok(num) = word.parse::<f64>() {
            Expr(Expression::Num(num))
        } else if let Ok(num) = parse_percentage(word) {
            Expr(Expression::Num(num / 100.0))
        } else if let Ok(size) = word.parse::<Size>() {
            Expr(Expression::Size(size))
        } else if let Some(ident) = Ident::new(word) {
            Expr(Expression::Ident(ident))
        } else {
            Text(word)
        }
    }

    /// Read source text until the predicate `f` returns true for a peeked
    /// char and return the traversed slice.
    ///
    /// - `f`: stop predicate, invoked on each peeked char.
    /// - `eat_match`: whether to also consume the char that matched.
    /// - `offset_start`: byte offset applied to the slice start relative to
    ///   the current index (may be negative to include already-read chars).
    /// - `offset_end`: byte offset applied to the slice end, but only when
    ///   the predicate actually matched (not on end of input).
    fn read_string_until<F>(
        &mut self,
        mut f: F,
        eat_match: bool,
        offset_start: isize,
        offset_end: isize,
    ) -> &'s str where F: FnMut(char) -> bool {
        let start = ((self.chars.index() as isize) + offset_start) as usize;
        let mut matched = false;
        while let Some(c) = self.chars.peek() {
            if f(c) {
                matched = true;
                if eat_match {
                    self.chars.next();
                }
                break;
            }
            self.chars.next();
        }
        let mut end = self.chars.index();
        if matched {
            end = ((end as isize) + offset_end) as usize;
        }
        &self.src[start .. end]
    }

    /// Enter a new tokenizer state, saving the current one on the stack.
    fn set_state(&mut self, state: State) {
        self.stack.push(self.state);
        self.state = state;
    }

    /// Return to the previously saved state, defaulting to `Body` when the
    /// stack is empty (e.g. on an unbalanced closing bracket).
    fn pop_state(&mut self) {
        self.state = self.stack.pop().unwrap_or(Body);
    }

    /// Consume and discard a single character.
    fn eat(&mut self) {
        self.chars.next();
    }
}
/// Parse a percentage literal like `"120%"` into its numeric value
/// (`120.0`). Returns `Err(())` when the trailing `%` is missing or the
/// remainder is not a valid floating-point number.
fn parse_percentage(word: &str) -> Result<f64, ()> {
    word.strip_suffix('%')
        .ok_or(())
        .and_then(|number| number.parse::<f64>().map_err(|_| ()))
}
/// Whether this character denotes a newline.
///
/// Covers the ASCII control range LF/VT/FF/CR plus the Unicode newline
/// characters NEL, LINE SEPARATOR and PARAGRAPH SEPARATOR.
fn is_newline_char(character: char) -> bool {
    match character {
        // Line Feed, Vertical Tab, Form Feed, Carriage Return.
        '\x0A' ..= '\x0D' => true,
        // Next Line, Line Separator, Paragraph Separator.
        '\u{0085}' | '\u{2028}' | '\u{2029}' => true,
        // Note: a duplicate arm listing '\n', '\r', '\u{000c}' and the three
        // Unicode separators was removed — every one of those characters is
        // already covered by the arms above, so it was unreachable.
        _ => false,
    }
}
/// A (index, char) iterator with double lookahead.
#[derive(Debug, Clone)]
struct PeekableChars<'s> {
string: &'s str,
chars: CharIndices<'s>,
peeked: SmallVec<[Option<(usize, char)>; 2]>,
base: usize,
struct Characters<'s> {
iter: Peekable<Chars<'s>>,
position: Position,
index: usize,
}
impl<'s> PeekableChars<'s> {
/// Create a new iterator from a string.
fn new(string: &'s str) -> PeekableChars<'s> {
PeekableChars {
string,
chars: string.char_indices(),
peeked: SmallVec::new(),
base: 0,
impl<'s> Characters<'s> {
fn new(src: &'s str) -> Characters<'s> {
Characters {
iter: src.chars().peekable(),
position: Position::new(0, 0),
index: 0,
}
}
/// Peek at the next element.
fn peek(&mut self) -> Option<(usize, char)> {
self.peekn(0)
}
fn next(&mut self) -> Option<char> {
let c = self.iter.next()?;
let len = c.len_utf8();
/// Peek at the char of the next element.
fn peekc(&mut self) -> Option<char> {
self.peekn(0).map(|p| p.1)
}
self.index += len;
/// Peek at the element after the next element.
fn peekn(&mut self, n: usize) -> Option<(usize, char)> {
while self.peeked.len() <= n {
let next = self.next_inner();
self.peeked.push(next);
if is_newline_char(c) && !(c == '\r' && self.peek() == Some('\n')) {
self.position.line += 1;
self.position.column = 0;
} else {
self.position.column += len;
}
self.peeked[n]
Some(c)
}
/// Return the next value of the inner iterator mapped with the offset.
fn next_inner(&mut self) -> Option<(usize, char)> {
self.chars.next().map(|(i, c)| (self.base + i, c))
fn peek(&mut self) -> Option<char> {
self.iter.peek().copied()
}
fn string_index(&self) -> usize {
fn index(&self) -> usize {
self.index
}
fn set_string_index(&mut self, index: usize) {
self.chars = self.string[index..].char_indices();
self.base = index;
self.index = 0;
self.peeked.clear();
}
}
impl Iterator for PeekableChars<'_> {
    type Item = (usize, char);

    /// Yield the next `(byte index, char)` pair, draining any buffered
    /// lookahead before pulling from the underlying iterator, and record
    /// the byte offset just past the returned character.
    fn next(&mut self) -> Option<(usize, char)> {
        let produced = match self.peeked.is_empty() {
            true => self.next_inner(),
            false => self.peeked.remove(0),
        };
        if let Some((start, ch)) = produced {
            self.index = start + ch.len_utf8();
        }
        produced
    }
}
#[cfg(test)]
mod tests {
use super::*;
use Token::{
Backtick as TB, BlockComment as BC, Colon as C, Equals as E, LeftBracket as L,
LineComment as LC, Newline as N, Quoted as Q, RightBracket as R, Space as S, Star as TS,
StarSlash as SS, Text as T, Underscore as TU,
};
/// Test if the source code tokenizes to the tokens.
fn test(src: &str, tokens: Vec<Token>) {
assert_eq!(Tokens::new(src)
.map(|token| token.v)
.collect::<Vec<_>>(), tokens);
}
/// Test if the tokens of the source code have the correct spans.
fn test_span(src: &str, spans: Vec<(usize, usize, usize, usize)>) {
assert_eq!(Tokens::new(src)
.map(|token| {
let Span { start, end } = token.span;
(start.line, start.column, end.line, end.column)
})
.collect::<Vec<_>>(), spans);
}
/// Tokenizes the basic building blocks.
#[test]
#[rustfmt::skip]
fn tokenize_base() {
test("", vec![]);
test("Hallo", vec![T("Hallo")]);
test("[", vec![L]);
test("]", vec![R]);
test("*", vec![TS]);
test("_", vec![TU]);
test("`", vec![TB]);
test("\n", vec![N]);
}
/// This test looks if LF- and CRLF-style newlines get both identified correctly.
#[test]
#[rustfmt::skip]
fn tokenize_whitespace_newlines() {
test(" \t", vec![S]);
test("First line\r\nSecond line\nThird line\n", vec![
T("First"), S, T("line"), N, T("Second"), S, T("line"), N,
T("Third"), S, T("line"), N
]);
test("Hello \n ", vec![T("Hello"), S, N, S]);
test("Dense\nTimes", vec![T("Dense"), N, T("Times")]);
}
/// Tests if escaping with backslash works as it should.
#[test]
#[rustfmt::skip]
fn tokenize_escape() {
test(r"\[", vec![T("[")]);
test(r"\]", vec![T("]")]);
test(r"\**", vec![T("*"), TS]);
test(r"\*", vec![T("*")]);
test(r"\__", vec![T("_"), TU]);
test(r"\_", vec![T("_")]);
test(r"\hello", vec![T("\\"), T("hello")]);
}
/// Tests if escaped strings work.
#[test]
#[rustfmt::skip]
fn tokenize_quoted() {
test(r#"[align: "hello\"world"]"#, vec![L, T("align"), C, S, Q(r#"hello\"world"#), R]);
}
/// Tokenizes some more realistic examples.
#[test]
#[rustfmt::skip]
fn tokenize_examples() {
test(r"
[function][
Test [italic][example]!
]
", vec![
N, S, L, T("function"), R, L, N, S, T("Test"), S, L, T("italic"), R, L,
T("example"), R, T("!"), N, S, R, N, S
]);
test(r"
[page: size=A4]
[font: size=12pt]
Das ist ein Beispielsatz mit *fetter* Schrift.
", vec![
N, S, L, T("page"), C, S, T("size"), E, T("A4"), R, N, S,
L, T("font"), C, S, T("size"), E, T("12pt"), R, N, N, S,
T("Das"), S, T("ist"), S, T("ein"), S, T("Beispielsatz"), S, T("mit"), S,
TS, T("fetter"), TS, S, T("Schrift."), N, S
]);
}
/// This test checks whether the colon and equals symbols get parsed correctly depending on the
/// context: Either in a function header or in a body.
#[test]
#[rustfmt::skip]
fn tokenize_symbols_context() {
test("[func: key=value][Answer: 7]", vec![
L, T("func"), C, S, T("key"), E, T("value"), R, L,
T("Answer:"), S, T("7"), R
]);
test("[[n: k=v]:x][:[=]]:=", vec![
L, L, T("n"), C, S, T("k"), E, T("v"), R, C, T("x"), R,
L, T(":"), L, E, R, R, T(":=")
]);
test("[hi: k=[func][body] v=1][hello]", vec![
L, T("hi"), C, S, T("k"), E, L, T("func"), R, L, T("body"), R, S,
T("v"), E, T("1"), R, L, T("hello"), R
]);
test("[func: __key__=value]", vec![L, T("func"), C, S, T("__key__"), E, T("value"), R]);
test("The /*[*/ answer: 7.", vec![T("The"), S, BC("["), S, T("answer:"), S, T("7.")]);
}
/// Test if block and line comments get tokenized as expected.
#[test]
#[rustfmt::skip]
fn tokenize_comments() {
test("These // Line comments.", vec![T("These"), S, LC(" Line comments.")]);
test("This /* is */ a comment.", vec![T("This"), S, BC(" is "), S, T("a"), S, T("comment.")]);
test("[Head/*of*/][Body]", vec![L, T("Head"), BC("of"), R, L, T("Body"), R]);
test("/* Hey */ */", vec![BC(" Hey "), S, SS]);
test("Hey\n// Yoo /*\n*/", vec![T("Hey"), N, LC(" Yoo /*"), N, SS]);
test("/* My /* line // */ comment */", vec![BC(" My /* line // */ comment ")])
}
/// This test has a special look at the underscore syntax.
#[test]
#[rustfmt::skip]
fn tokenize_underscores() {
test("he_llo_world_ __ Now this_ is_ special!",
vec![T("he"), TU, T("llo"), TU, T("world"), TU, S, TU, TU, S, T("Now"), S,
T("this"), TU, S, T("is"), TU, S, T("special!")]);
}
/// This test is for checking if non-ASCII characters get parsed correctly.
#[test]
#[rustfmt::skip]
fn tokenize_unicode() {
test("[document][Hello 🌍!]", vec![L, T("document"), R, L, T("Hello"), S, T("🌍!"), R]);
test("[f]⺐.", vec![L, T("f"), R, T("⺐.")]);
}
/// This test checks if all tokens have the correct spans.
#[test]
#[rustfmt::skip]
fn tokenize_spans() {
test_span("Hello World", vec![(1, 0, 1, 5), (1, 5, 1, 6), (1, 6, 1, 11)]);
test_span("🌍_🎈", vec![(1, 0, 1, 4), (1, 4, 1, 5), (1, 5, 1, 9)]);
test_span("hello\nworld", vec![(1, 0, 1, 5), (1, 5, 1, 6), (2, 0, 2, 5)]);
test_span("[hello: world]", vec![
(1, 0, 1, 1), (1, 1, 1, 6), (1, 6, 1, 7),
(1, 7, 1, 8), (1, 8, 1, 13), (1, 13, 1, 14)
]);
fn position(&self) -> Position {
self.position
}
}

View File

@ -1,9 +1,26 @@
#![allow(unused_imports)]
#![allow(non_snake_case)]
use typstc::size::Size;
use typstc::syntax::*;
use Token::{
Space as S, Newline as N, LeftBracket as LB,
RightBracket as RB, Text as T, *
Whitespace as W,
LineComment as LC, BlockComment as BC, StarSlash as SS,
LeftBracket as LB, RightBracket as RB,
LeftParen as LP, RightParen as RP,
LeftBrace as LBR, RightBrace as RBR,
Colon as CL, Comma as CM, Equals as EQ, Expr as E,
Star as ST, Underscore as U, Backtick as B, Text as T,
};
use Expression as Expr;
fn ID(ident: &str) -> Token { E(Expr::Ident(Ident::new(ident.to_string()).unwrap())) }
fn STR(ident: &str) -> Token { E(Expr::Str(ident.to_string())) }
fn SIZE(size: Size) -> Token<'static> { E(Expr::Size(size)) }
fn NUM(num: f64) -> Token<'static> { E(Expr::Num(num)) }
fn BOOL(b: bool) -> Token<'static> { E(Expr::Bool(b)) }
/// Parses the test syntax.
macro_rules! tokens {
($($src:expr =>($line:expr)=> $tokens:expr)*) => ({

View File

@ -1,78 +0,0 @@
// Spaces, Newlines, Brackets.
"" => []
" " => [S]
" " => [S]
"\t" => [S]
" \t" => [S]
"\n" => [N]
"\n " => [N, S]
" \n" => [S, N]
" \n " => [S, N, S]
"[" => [LB]
"]" => [RB]
// Header only tokens.
"[:]" => [LB, Colon, RB]
"[=]" => [LB, Equals, RB]
"[,]" => [LB, Comma, RB]
":" => [T(":")]
"=" => [T("=")]
"," => [T(",")]
r#"["hi"]"# => [LB, Quoted("hi"), RB]
r#""hi""# => [T(r#""hi""#)]
// Body only tokens.
"_" => [Underscore]
"*" => [Star]
"`" => [Backtick]
"[_]" => [LB, T("_"), RB]
"[*]" => [LB, T("*"), RB]
"[`]" => [LB, T("`"), RB]
// Comments.
"//line" => [LineComment("line")]
"/*block*/" => [BlockComment("block")]
"*/" => [StarSlash]
// Plain text.
"A" => [T("A")]
"Hello" => [T("Hello")]
"Hello-World" => [T("Hello-World")]
r#"A"B"# => [T(r#"A"B"#)]
"🌍" => [T("🌍")]
// Escapes.
r"\[" => [T("[")]
r"\]" => [T("]")]
r"\\" => [T(r"\")]
r"[\[]" => [LB, T("["), RB]
r"[\]]" => [LB, T("]"), RB]
r"[\\]" => [LB, T(r"\"), RB]
r"\:" => [T(":")]
r"\=" => [T("=")]
r"\/" => [T("/")]
r"[\:]" => [LB, T(":"), RB]
r"[\=]" => [LB, T("="), RB]
r"[\,]" => [LB, T(","), RB]
r"\*" => [T("*")]
r"\_" => [T("_")]
r"\`" => [T("`")]
r"[\*]" => [LB, T("*"), RB]
r"[\_]" => [LB, T("_"), RB]
r"[\`]" => [LB, T("`"), RB]
// Whitespace.
"Hello World" => [T("Hello"), S, T("World")]
"Hello World" => [T("Hello"), S, T("World")]
"Hello \t World" => [T("Hello"), S, T("World")]
// Newline.
"First\n" => [T("First"), N]
"First \n" => [T("First"), S, N]
"First\n " => [T("First"), N, S]
"First \n " => [T("First"), S, N, S]
"First\nSecond" => [T("First"), N, T("Second")]
"First\r\nSecond" => [T("First"), N, T("Second")]
"First \nSecond" => [T("First"), S, N, T("Second")]
"First\n Second" => [T("First"), N, S, T("Second")]
"First \n Second" => [T("First"), S, N, S, T("Second")]

62
tests/parsing/tokens.rs Normal file
View File

@ -0,0 +1,62 @@
// Whitespace.
"" => []
" " => [W(0)]
" " => [W(0)]
"\t" => [W(0)]
" \t" => [W(0)]
"\n" => [W(1)]
"\n " => [W(1)]
" \n" => [W(1)]
" \n " => [W(1)]
" \n\t \n " => [W(2)]
"\r\n" => [W(1)]
" \r\r\n \x0D" => [W(3)]
"\n\r" => [W(2)]
// Comments.
"a // bc\n " => [T("a"), W(0), LC(" bc"), W(1)]
"a //a//b\n " => [T("a"), W(0), LC("a//b"), W(1)]
"a //a//b\r\n" => [T("a"), W(0), LC("a//b"), W(1)]
"a //a//b\n\nhello" => [T("a"), W(0), LC("a//b"), W(2), T("hello")]
"/**/" => [BC("")]
"_/*_/*a*/*/" => [U, BC("_/*a*/")]
"/*/*/" => [BC("/*/")]
"abc*/" => [T("abc"), SS]
// Header only tokens.
"[" => [LB]
"]" => [RB]
"[(){}:=,]" => [LB, LP, RP, LBR, RBR, CL, EQ, CM, RB]
"[a:b]" => [LB, ID("a"), CL, ID("b"), RB]
"[🌓, 🌍,]" => [LB, T("🌓"), CM, W(0), T("🌍"), CM, RB]
"[=]" => [LB, EQ, RB]
"[,]" => [LB, CM, RB]
"a: b" => [T("a"), T(":"), W(0), T("b")]
"c=d, " => [T("c"), T("=d"), T(","), W(0)]
r#"["hello\"world"]"# => [LB, STR(r#"hello\"world"#), RB]
r#"["hi", 12pt]"# => [LB, STR("hi"), CM, W(0), SIZE(Size::pt(12.0)), RB]
"\"hi\"" => [T("\"hi"), T("\"")]
"[a: true, x=1]" => [LB, ID("a"), CL, W(0), BOOL(true), CM, W(0),
ID("x"), EQ, NUM(1.0), RB]
"[120%]" => [LB, NUM(1.2), RB]
// Body only tokens.
"_*`" => [U, ST, B]
"[_*`]" => [LB, T("_"), T("*"), T("`"), RB]
"hi_you_ there" => [T("hi"), U, T("you"), U, W(0), T("there")]
// Escapes.
r"\[" => [T("[")]
r"\]" => [T("]")]
r"\\" => [T(r"\")]
r"\/" => [T("/")]
r"\*" => [T("*")]
r"\_" => [T("_")]
r"\`" => [T("`")]
// Unescapable special symbols.
r"\:" => [T(r"\"), T(":")]
r"\=" => [T(r"\"), T("=")]
r"[\:]" => [LB, T(r"\"), CL, RB]
r"[\=]" => [LB, T(r"\"), EQ, RB]
r"[\,]" => [LB, T(r"\"), CM, RB]