Remove the concept of words from tokenization 🎈

This commit is contained in:
Laurenz 2019-04-30 09:15:31 +02:00
parent 90848df5de
commit 9d605c3128
6 changed files with 152 additions and 289 deletions

View File

@ -7,7 +7,6 @@ edition = "2018"
[dependencies]
pdf = { path = "../pdf" }
opentype = { path = "../opentype" }
unicode-segmentation = "1.2"
unicode-xid = "0.1.0"
byteorder = "1"
smallvec = "0.6.9"

View File

@ -62,7 +62,7 @@ impl<'a> Engine<'a> {
// Iterate through the documents nodes.
for node in &self.tree.nodes {
match node {
Node::Word(word) => self.write_word(word)?,
Node::Text(text) => self.write_word(text)?,
Node::Space => self.write_space()?,
Node::Newline => {
self.write_buffered_text();

View File

@ -53,7 +53,6 @@ use crate::syntax::SyntaxTree;
#[macro_use]
mod error;
mod utility;
pub mod doc;
pub mod engine;
pub mod export;

View File

@ -1,30 +1,29 @@
//! Tokenization and parsing of source code into syntax trees.
use std::collections::HashMap;
use std::fmt;
use std::iter::Peekable;
use std::mem::swap;
use std::ops::Deref;
use std::str::CharIndices;
use unicode_segmentation::{UnicodeSegmentation, UWordBounds};
use unicode_xid::UnicodeXID;
use crate::syntax::*;
use crate::func::{ParseContext, Scope};
use crate::utility::{Splinor, Spline, Splined, StrExt};
/// An iterator over the tokens of source code.
#[derive(Clone)]
#[derive(Debug, Clone)]
pub struct Tokens<'s> {
source: &'s str,
words: Peekable<UWordBounds<'s>>,
state: TokensState<'s>,
stack: Vec<TokensState<'s>>,
chars: Peekable<CharIndices<'s>>,
state: TokensState,
stack: Vec<TokensState>,
}
/// The state the tokenizer is in.
#[derive(Debug, Clone)]
enum TokensState<'s> {
#[derive(Debug, Clone, PartialEq)]
enum TokensState {
/// The base state if there is nothing special we are in.
Body,
/// Inside a function header. Here colons and equal signs get parsed
@ -32,9 +31,6 @@ enum TokensState<'s> {
Function,
/// We expect either the end of the function or the beginning of the body.
MaybeBody,
/// We are inside one unicode word that consists of multiple tokens,
/// because it contains double underscores.
DoubleUnderscore(Spline<'s, Token<'s>>),
}
impl<'s> Tokens<'s> {
@ -43,7 +39,7 @@ impl<'s> Tokens<'s> {
pub fn new(source: &'s str) -> Tokens<'s> {
Tokens {
source,
words: source.split_word_bounds().peekable(),
chars: source.char_indices().peekable(),
state: TokensState::Body,
stack: vec![],
}
@ -51,11 +47,11 @@ impl<'s> Tokens<'s> {
/// Advance the iterator by one step.
fn advance(&mut self) {
self.words.next();
self.chars.next();
}
/// Switch to the given state.
fn switch(&mut self, mut state: TokensState<'s>) {
fn switch(&mut self, mut state: TokensState) {
swap(&mut state, &mut self.state);
self.stack.push(state);
}
@ -70,6 +66,11 @@ impl<'s> Tokens<'s> {
self.advance();
token
}
/// Returns a word containing the string bounded by the given indices.
fn text(&self, start: usize, end: usize) -> Token<'s> {
Token::Text(&self.source[start .. end])
}
}
impl<'s> Iterator for Tokens<'s> {
@ -79,27 +80,11 @@ impl<'s> Iterator for Tokens<'s> {
fn next(&mut self) -> Option<Token<'s>> {
use TokensState as TS;
// Return the remaining words and double underscores.
if let TS::DoubleUnderscore(splinor) = &mut self.state {
loop {
if let Some(splined) = splinor.next() {
return Some(match splined {
Splined::Value(word) if word != "" => Token::Word(word),
Splined::Splinor(s) => s,
_ => continue,
});
} else {
self.unswitch();
break;
}
}
}
// Skip whitespace, but if at least one whitespace word existed,
// remember that, because we return a space token.
// Skip whitespace, but if at least one whitespace character existed,
// remember that, because then we return a space token.
let mut whitespace = false;
while let Some(word) = self.words.peek() {
if !word.is_whitespace() {
while let Some(&(_, c)) = self.chars.peek() {
if !c.is_whitespace() || c == '\n' || c == '\r' {
break;
}
whitespace = true;
@ -111,101 +96,83 @@ impl<'s> Iterator for Tokens<'s> {
// Function maybe has a body
if self.state == TS::MaybeBody {
match *self.words.peek()? {
"[" => {
if self.chars.peek()?.1 == '[' {
self.state = TS::Body;
return Some(self.consumed(Token::LeftBracket));
},
_ => self.unswitch(),
} else {
self.unswitch();
}
}
// Now all special cases are handled and we can finally look at the
// next words.
let next = self.words.next()?;
let afterwards = self.words.peek();
let (next_pos, next) = self.chars.next()?;
let afterwards = self.chars.peek().map(|&(_, c)| c);
Some(match next {
// Special characters
"[" => {
'[' => {
self.switch(TS::Function);
Token::LeftBracket
},
"]" => {
']' => {
if self.state == TS::Function {
self.state = TS::MaybeBody;
}
Token::RightBracket
},
"$" => Token::Dollar,
"#" => Token::Hashtag,
'$' => Token::Dollar,
'#' => Token::Hashtag,
// Context sensitive operators
":" if self.state == TS::Function => Token::Colon,
"=" if self.state == TS::Function => Token::Equals,
':' if self.state == TS::Function => Token::Colon,
'=' if self.state == TS::Function => Token::Equals,
// Double star/underscore
"*" if afterwards == Some(&"*") => self.consumed(Token::DoubleStar),
"__" => Token::DoubleUnderscore,
'*' if afterwards == Some('*') => self.consumed(Token::DoubleStar),
'_' if afterwards == Some('_') => self.consumed(Token::DoubleUnderscore),
// Newlines
"\n" | "\r\n" => Token::Newline,
'\n' => Token::Newline,
'\r' if afterwards == Some('\n') => self.consumed(Token::Newline),
// Escaping
r"\" => {
if let Some(next) = afterwards {
let escapable = match *next {
"[" | "]" | "$" | "#" | r"\" | ":" | "=" | "*" | "_" => true,
w if w.starts_with("__") => true,
_ => false,
};
if escapable {
let next = *next;
'\\' => {
if let Some(&(index, c)) = self.chars.peek() {
if is_special_character(c) {
self.advance();
return Some(Token::Word(next));
return Some(self.text(index, index + c.len_utf8()));
}
}
Token::Word(r"\")
},
// Double underscores hidden in words.
word if word.contains("__") => {
let spline = word.spline("__", Token::DoubleUnderscore);
self.switch(TS::DoubleUnderscore(spline));
return self.next();
Token::Text("\\")
},
// Now it seems like it's just a normal word.
word => Token::Word(word),
_ => {
// Find out when the word ends.
let mut end = (next_pos, next);
while let Some(&(index, c)) = self.chars.peek() {
if is_special_character(c) || c.is_whitespace() {
break;
}
end = (index, c);
self.advance();
}
let end_pos = end.0 + end.1.len_utf8();
self.text(next_pos, end_pos)
},
})
}
}
impl fmt::Debug for Tokens<'_> {
fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
f.debug_struct("Tokens")
.field("source", &self.source)
.field("words", &"Peekable<UWordBounds>")
.field("state", &self.state)
.field("stack", &self.stack)
.finish()
}
}
impl PartialEq for TokensState<'_> {
fn eq(&self, other: &TokensState) -> bool {
use TokensState as TS;
match (self, other) {
(TS::Body, TS::Body) => true,
(TS::Function, TS::Function) => true,
(TS::MaybeBody, TS::MaybeBody) => true,
// They are not necessarily different, but we don't care
/// Whether this character has a special meaning in the language.
fn is_special_character(character: char) -> bool {
match character {
'[' | ']' | '$' | '#' | '\\' | ':' | '=' | '*' | '_' => true,
_ => false,
}
}
}
/// Transforms token streams to syntax trees.
@ -285,8 +252,8 @@ impl<'s, 't> Parser<'s, 't> {
Token::Space => self.append_space_consumed(),
Token::Newline => self.switch_consumed(PS::FirstNewline),
// Words
Token::Word(word) => self.append_consumed(Node::Word(word.to_owned())),
// Text
Token::Text(word) => self.append_consumed(Node::Text(word.to_owned())),
// Functions
Token::LeftBracket => self.parse_function()?,
@ -315,7 +282,7 @@ impl<'s, 't> Parser<'s, 't> {
// The next token should be the name of the function.
let name = match self.tokens.next() {
Some(Token::Word(word)) => {
Some(Token::Text(word)) => {
if word.is_identifier() {
Ok(word.to_owned())
} else {
@ -537,6 +504,39 @@ impl<'s> Iterator for ParseTokens<'s> {
}
}
/// More useful functions on `str`'s.
trait StrExt {
/// Whether self consists only of whitespace.
fn is_whitespace(&self) -> bool;
/// Whether this word is a valid unicode identifier.
fn is_identifier(&self) -> bool;
}
impl StrExt for str {
fn is_whitespace(&self) -> bool {
self.chars().all(|c| c.is_whitespace() && c != '\n')
}
fn is_identifier(&self) -> bool {
let mut chars = self.chars();
match chars.next() {
Some(c) if !UnicodeXID::is_xid_start(c) => return false,
None => return false,
_ => (),
}
while let Some(c) = chars.next() {
if !UnicodeXID::is_xid_continue(c) {
return false;
}
}
true
}
}
/// The error type for parsing.
pub struct ParseError(String);
@ -560,7 +560,7 @@ mod token_tests {
use super::*;
use Token::{Space as S, Newline as N, LeftBracket as L, RightBracket as R,
Colon as C, Equals as E, DoubleUnderscore as DU, DoubleStar as DS,
Dollar as D, Hashtag as H, Word as W};
Dollar as D, Hashtag as H, Text as T};
/// Test if the source code tokenizes to the tokens.
fn test(src: &str, tokens: Vec<Token>) {
@ -571,7 +571,7 @@ mod token_tests {
#[test]
fn tokenize_base() {
test("", vec![]);
test("Hallo", vec![W("Hallo")]);
test("Hallo", vec![T("Hallo")]);
test("[", vec![L]);
test("]", vec![R]);
test("$", vec![D]);
@ -586,26 +586,26 @@ mod token_tests {
fn tokenize_whitespace_newlines() {
test(" \t", vec![S]);
test("First line\r\nSecond line\nThird line\n",
vec![W("First"), S, W("line"), N, W("Second"), S, W("line"), N,
W("Third"), S, W("line"), N]);
test("Hello \n ", vec![W("Hello"), S, N, S]);
test("Dense\nTimes", vec![W("Dense"), N, W("Times")]);
vec![T("First"), S, T("line"), N, T("Second"), S, T("line"), N,
T("Third"), S, T("line"), N]);
test("Hello \n ", vec![T("Hello"), S, N, S]);
test("Dense\nTimes", vec![T("Dense"), N, T("Times")]);
}
/// Tests if escaping with backslash works as it should.
#[test]
fn tokenize_escape() {
test(r"\[", vec![W("[")]);
test(r"\]", vec![W("]")]);
test(r"\#", vec![W("#")]);
test(r"\$", vec![W("$")]);
test(r"\:", vec![W(":")]);
test(r"\=", vec![W("=")]);
test(r"\**", vec![W("*"), W("*")]);
test(r"\*", vec![W("*")]);
test(r"\__", vec![W("__")]);
test(r"\_", vec![W("_")]);
test(r"\hello", vec![W(r"\"), W("hello")]);
test(r"\[", vec![T("[")]);
test(r"\]", vec![T("]")]);
test(r"\#", vec![T("#")]);
test(r"\$", vec![T("$")]);
test(r"\:", vec![T(":")]);
test(r"\=", vec![T("=")]);
test(r"\**", vec![T("*"), T("*")]);
test(r"\*", vec![T("*")]);
test(r"\__", vec![T("_"), T("_")]);
test(r"\_", vec![T("_")]);
test(r"\hello", vec![T("\\"), T("hello")]);
}
/// Tokenizes some more realistic examples.
@ -616,8 +616,8 @@ mod token_tests {
Test [italic][example]!
]
", vec![
N, S, L, W("function"), R, L, N, S, W("Test"), S, L, W("italic"), R, L,
W("example"), R, W("!"), N, S, R, N, S
N, S, L, T("function"), R, L, N, S, T("Test"), S, L, T("italic"), R, L,
T("example"), R, T("!"), N, S, R, N, S
]);
test(r"
@ -626,10 +626,10 @@ mod token_tests {
Das ist ein Beispielsatz mit **fetter** Schrift.
", vec![
N, S, L, W("page"), C, S, W("size"), E, W("A4"), R, N, S,
L, W("font"), C, S, W("size"), E, W("12pt"), R, N, N, S,
W("Das"), S, W("ist"), S, W("ein"), S, W("Beispielsatz"), S, W("mit"), S,
DS, W("fetter"), DS, S, W("Schrift"), W("."), N, S
N, S, L, T("page"), C, S, T("size"), E, T("A4"), R, N, S,
L, T("font"), C, S, T("size"), E, T("12pt"), R, N, N, S,
T("Das"), S, T("ist"), S, T("ein"), S, T("Beispielsatz"), S, T("mit"), S,
DS, T("fetter"), DS, S, T("Schrift."), N, S
]);
}
@ -638,13 +638,13 @@ mod token_tests {
#[test]
fn tokenize_symbols_context() {
test("[func: key=value][Answer: 7]",
vec![L, W("func"), C, S, W("key"), E, W("value"), R, L,
W("Answer"), W(":"), S, W("7"), R]);
vec![L, T("func"), C, S, T("key"), E, T("value"), R, L,
T("Answer"), T(":"), S, T("7"), R]);
test("[[n: k=v]:x][:[=]]:=",
vec![L, L, W("n"), C, S, W("k"), E, W("v"), R, C, W("x"), R,
L, W(":"), L, E, R, R, W(":"), W("=")]);
vec![L, L, T("n"), C, S, T("k"), E, T("v"), R, C, T("x"), R,
L, T(":"), L, E, R, R, T(":"), T("=")]);
test("[func: __key__=value]",
vec![L, W("func"), C, S, DU, W("key"), DU, E, W("value"), R]);
vec![L, T("func"), C, S, DU, T("key"), DU, E, T("value"), R]);
}
/// This test has a special look at the double underscore syntax, because
@ -653,16 +653,16 @@ mod token_tests {
#[test]
fn tokenize_double_underscore() {
test("he__llo__world_ _ __ Now this_ is__ special!",
vec![W("he"), DU, W("llo"), DU, W("world_"), S, W("_"), S, DU, S, W("Now"), S,
W("this_"), S, W("is"), DU, S, W("special"), W("!")]);
vec![T("he"), DU, T("llo"), DU, T("world"), T("_"), S, T("_"), S, DU, S, T("Now"), S,
T("this"), T("_"), S, T("is"), DU, S, T("special!")]);
}
/// This test is for checking if non-ASCII characters get parsed correctly.
#[test]
fn tokenize_unicode() {
test("[document][Hello 🌍!]",
vec![L, W("document"), R, L, W("Hello"), S, W("🌍"), W("!"), R]);
test("[f]⺐.", vec![L, W("f"), R, W(""), W(".")]);
vec![L, T("document"), R, L, T("Hello"), S, T("🌍!"), R]);
test("[f]⺐.", vec![L, T("f"), R, T(".")]);
}
}
@ -674,7 +674,7 @@ mod parse_tests {
use Node::{Space as S, Newline as N, Func as F};
#[allow(non_snake_case)]
fn W(s: &str) -> Node { Node::Word(s.to_owned()) }
fn T(s: &str) -> Node { Node::Text(s.to_owned()) }
/// A testing function which just parses it's body into a syntax tree.
#[derive(Debug, PartialEq)]
@ -764,19 +764,19 @@ mod parse_tests {
#[test]
fn parse_base() {
test("", tree! []);
test("Hello World!", tree! [ W("Hello"), S, W("World"), W("!") ]);
test("Hello World!", tree! [ T("Hello"), S, T("World!") ]);
}
/// Test whether newlines generate the correct whitespace.
#[test]
fn parse_newlines_whitespace() {
test("Hello\nWorld", tree! [ W("Hello"), S, W("World") ]);
test("Hello \n World", tree! [ W("Hello"), S, W("World") ]);
test("Hello\n\nWorld", tree! [ W("Hello"), N, W("World") ]);
test("Hello \n\nWorld", tree! [ W("Hello"), S, N, W("World") ]);
test("Hello\n\n World", tree! [ W("Hello"), N, S, W("World") ]);
test("Hello \n \n \n World", tree! [ W("Hello"), S, N, S, W("World") ]);
test("Hello\n \n\n World", tree! [ W("Hello"), S, N, S, W("World") ]);
test("Hello\nWorld", tree! [ T("Hello"), S, T("World") ]);
test("Hello \n World", tree! [ T("Hello"), S, T("World") ]);
test("Hello\n\nWorld", tree! [ T("Hello"), N, T("World") ]);
test("Hello \n\nWorld", tree! [ T("Hello"), S, N, T("World") ]);
test("Hello\n\n World", tree! [ T("Hello"), N, S, T("World") ]);
test("Hello \n \n \n World", tree! [ T("Hello"), S, N, S, T("World") ]);
test("Hello\n \n\n World", tree! [ T("Hello"), S, N, S, T("World") ]);
}
/// Parse things dealing with functions.
@ -790,18 +790,18 @@ mod parse_tests {
test_scoped(&scope,"[test]", tree! [ F(func! { name => "test", body => None }) ]);
test_scoped(&scope, "This is an [modifier][example] of a function invocation.", tree! [
W("This"), S, W("is"), S, W("an"), S,
F(func! { name => "modifier", body => tree! [ W("example") ] }), S,
W("of"), S, W("a"), S, W("function"), S, W("invocation"), W(".")
T("This"), S, T("is"), S, T("an"), S,
F(func! { name => "modifier", body => tree! [ T("example") ] }), S,
T("of"), S, T("a"), S, T("function"), S, T("invocation.")
]);
test_scoped(&scope, "[func][Hello][modifier][Here][end]", tree! [
F(func! {
name => "func",
body => tree! [ W("Hello") ],
body => tree! [ T("Hello") ],
}),
F(func! {
name => "modifier",
body => tree! [ W("Here") ],
body => tree! [ T("Here") ],
}),
F(func! {
name => "end",
@ -820,11 +820,11 @@ mod parse_tests {
body => tree! [
F(func! {
name => "func",
body => tree! [ W("call") ],
body => tree! [ T("call") ],
}),
],
}),
S, W("outside")
S, T("outside")
]);
}
@ -839,12 +839,12 @@ mod parse_tests {
name => "func",
body => None,
}),
S, W(""), W(".")
S, T(".")
]);
test_scoped(&scope, "[bold][Hello 🌍!]", tree! [
F(func! {
name => "bold",
body => tree! [ W("Hello"), S, W("🌍"), W("!") ],
body => tree! [ T("Hello"), S, T("🌍!") ],
})
]);
}

View File

@ -30,8 +30,8 @@ pub enum Token<'s> {
Dollar,
/// A hashtag starting a _comment_.
Hashtag,
/// Everything else just is a literal word.
Word(&'s str),
/// Everything else is just text.
Text(&'s str),
}
/// A tree representation of the source.
@ -62,8 +62,8 @@ pub enum Node {
ToggleBold,
/// Indicates that math mode was enabled/disabled.
ToggleMath,
/// A literal word.
Word(String),
/// Literal text.
Text(String),
/// A function invocation.
Func(FuncCall),
}

View File

@ -1,135 +0,0 @@
//! Utility functionality.
use std::iter::Peekable;
use std::str::Split;
use unicode_xid::UnicodeXID;
/// Types that can be splined.
pub trait Splinor {
/// Returns an iterator over the substrings splitted by the pattern,
/// intertwined with the splinor.
///
/// # Example
///
/// ```ignore
/// #[derive(Debug, Copy, Clone, PartialEq)]
/// struct Space;
///
/// let v: Vec<Splined<Space>> = "My airplane flies!".spline(" ", Space).collect();
/// assert_eq!(v, [
/// Splined::Value("My"),
/// Splined::Splinor(Space),
/// Splined::Value("airplane"),
/// Splined::Splinor(Space),
/// Splined::Value("flies!"),
/// ]);
/// ```
fn spline<'s, T: Clone>(&'s self, pat: &'s str, splinor: T) -> Spline<'s, T>;
}
impl Splinor for str {
fn spline<'s, T: Clone>(&'s self, pat: &'s str, splinor: T) -> Spline<'s, T> {
Spline {
splinor: Splined::Splinor(splinor),
split: self.split(pat).peekable(),
next_splinor: false,
}
}
}
/// Iterator over splitted values and splinors.
///
/// Created by the [`spline`](Splinor::spline) function.
#[derive(Debug, Clone)]
pub struct Spline<'s, T> {
splinor: Splined<'s, T>,
split: Peekable<Split<'s, &'s str>>,
next_splinor: bool,
}
/// Represents either a splitted substring or a splinor.
#[derive(Debug, Copy, Clone, Eq, PartialEq)]
pub enum Splined<'s, T> {
/// A substring.
Value(&'s str),
/// An intertwined splinor.
Splinor(T),
}
impl<'s, T: Clone> Iterator for Spline<'s, T> {
type Item = Splined<'s, T>;
fn next(&mut self) -> Option<Splined<'s, T>> {
if self.next_splinor && self.split.peek().is_some() {
self.next_splinor = false;
return Some(self.splinor.clone());
} else {
self.next_splinor = true;
return Some(Splined::Value(self.split.next()?))
}
}
}
/// More useful functions on `str`'s.
pub trait StrExt {
/// Whether self consists only of whitespace.
fn is_whitespace(&self) -> bool;
/// Whether this word is a valid unicode identifier.
fn is_identifier(&self) -> bool;
}
impl StrExt for str {
fn is_whitespace(&self) -> bool {
self.chars().all(|c| c.is_whitespace() && c != '\n')
}
fn is_identifier(&self) -> bool {
let mut chars = self.chars();
match chars.next() {
Some(c) if !UnicodeXID::is_xid_start(c) => return false,
None => return false,
_ => (),
}
while let Some(c) = chars.next() {
if !UnicodeXID::is_xid_continue(c) {
return false;
}
}
true
}
}
#[cfg(test)]
mod splinor_tests {
use super::*;
use Splined::{Value as V, Splinor as S};
#[derive(Debug, Copy, Clone, PartialEq)]
enum Token { DoubleUnderscore }
fn test<T>(string: &str, pat: &str, splinor: T, vec: Vec<Splined<T>>)
where T: std::fmt::Debug + Clone + PartialEq {
assert_eq!(string.spline(pat, splinor).collect::<Vec<_>>(), vec);
}
#[test]
fn splinor() {
let s = S(Token::DoubleUnderscore);
test("__he__llo__world__", "__", Token::DoubleUnderscore,
vec![V(""), s, V("he"), s, V("llo"), s, V("world"), s, V("")]);
test("__Italic__", "__", Token::DoubleUnderscore,
vec![V(""), s, V("Italic"), s, V("")]);
test("Key__Value", "__", Token::DoubleUnderscore,
vec![V("Key"), s, V("Value")]);
test("__Start__NoEnd", "__", Token::DoubleUnderscore,
vec![V(""), s, V("Start"), s, V("NoEnd")]);
test("NoStart__End__", "__", Token::DoubleUnderscore,
vec![V("NoStart"), s, V("End"), s, V("")]);
}
}