From c2b6f2dc359d3b5c5b09996b8902c09e27271b4c Mon Sep 17 00:00:00 2001 From: Martin Haug Date: Sat, 29 Aug 2020 13:53:59 +0200 Subject: [PATCH 1/3] =?UTF-8?q?Added=20code=20blocks=20=F0=9F=9A=9F?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- src/layout/tree.rs | 18 ++++- src/syntax/parsing.rs | 162 ++++++++++++++++++++++++++++++++++++++++-- src/syntax/tokens.rs | 80 ++++++++++++++++++++- src/syntax/tree.rs | 8 +++ 4 files changed, 261 insertions(+), 7 deletions(-) diff --git a/src/layout/tree.rs b/src/layout/tree.rs index e500c4ba2..f039d9b09 100644 --- a/src/layout/tree.rs +++ b/src/layout/tree.rs @@ -3,7 +3,7 @@ use crate::style::LayoutStyle; use crate::syntax::decoration::Decoration; use crate::syntax::span::{Span, Spanned}; -use crate::syntax::tree::{CallExpr, SyntaxNode, SyntaxTree}; +use crate::syntax::tree::{CallExpr, SyntaxNode, SyntaxTree, CodeBlockExpr}; use crate::{DynFuture, Feedback, Pass}; use super::line::{LineContext, LineLayouter}; use super::text::{layout_text, TextContext}; @@ -80,6 +80,7 @@ impl<'a> TreeLayouter<'a> { } SyntaxNode::Raw(lines) => self.layout_raw(lines).await, + SyntaxNode::CodeBlock(block) => self.layout_code(block).await, SyntaxNode::Par(par) => self.layout_par(par).await, SyntaxNode::Call(call) => { self.layout_call(Spanned::new(call, node.span)).await; @@ -128,6 +129,21 @@ impl<'a> TreeLayouter<'a> { self.style.text.fallback = fallback; } + async fn layout_code(&mut self, block: &CodeBlockExpr) { + let fallback = self.style.text.fallback.clone(); + self.style.text.fallback + .list_mut() + .insert(0, "monospace".to_string()); + self.style.text.fallback.flatten(); + + for line in &block.raw { + self.layout_text(line).await; + self.layouter.finish_line(); + } + + self.style.text.fallback = fallback; + } + async fn layout_par(&mut self, par: &SyntaxTree) { self.layout_tree(par).await; self.layouter.add_secondary_spacing( diff --git a/src/syntax/parsing.rs b/src/syntax/parsing.rs index 29a9d788f..e9bbf2e58 100644 --- a/src/syntax/parsing.rs +++ b/src/syntax/parsing.rs @@ -7,8 +7,15 @@ use crate::color::RgbaColor; use crate::compute::table::SpannedEntry; use super::decoration::Decoration; use super::span::{Pos, Span, Spanned}; -use super::tokens::{is_newline_char, Token, TokenMode, Tokens}; -use super::tree::{CallExpr, Expr, SyntaxNode, SyntaxTree, TableExpr}; +use super::tokens::{is_newline_char, Token, TokenMode, Tokens, is_identifier}; +use super::tree::{ + CallExpr, + Expr, + SyntaxNode, + SyntaxTree, + TableExpr, + CodeBlockExpr, +}; use super::Ident; /// Parse a string of source code. @@ -84,6 +91,34 @@ impl Parser<'_> { self.with_span(SyntaxNode::Raw(unescape_raw(raw))) } + Token::Code { lang, raw, terminated } => { + if !terminated { + error!( + @self.feedback, Span::at(token.span.end), + "expected code block to close", + ); + } + let mut valid_ident = false; + let mut lang = lang.map(|s| s.map(|v| { + if is_identifier(v) { + valid_ident = true; + } + Ident(v.to_string()) + })); + + if !valid_ident { + if let Some(l) = lang { + error!( + @self.feedback, l.span, + "expected language to be a valid identifier", + ); + } + lang = None; + } + + self.with_span(SyntaxNode::CodeBlock(CodeBlockExpr { raw: unescape_code(raw), lang })) + } + Token::Text(text) => { self.with_span(SyntaxNode::Text(text.to_string())) } @@ -627,6 +662,84 @@ fn unescape_raw(raw: &str) -> Vec { lines } +/// Unescape raw markup and split it into into lines. +fn unescape_code(raw: &str) -> Vec { + let mut iter = raw.chars().peekable(); + let mut line = String::new(); + let mut lines = Vec::new(); + let mut backticks: usize = 0; + + // This assignment is used in line 731, 733; + // the compiler does not want to acknowledge that, however. + #[allow(unused_assignments)] + let mut update_backtick_count = true; + + while let Some(c) = iter.next() { + update_backtick_count = true; + if is_newline_char(c) { + if c == '\r' && iter.peek() == Some(&'\n') { + iter.next(); + } + + lines.push(std::mem::take(&mut line)); + } else { + if c == '\\' && backticks > 0 { + let mut tail = String::new(); + let mut escape_success = false; + + let mut backticks_after_slash: u8 = 0; + + while let Some(&s) = iter.peek() { + match s { + '\\' => { + if backticks_after_slash == 0 { + tail.push(s); + } else { + // Pattern like `\`\` should fail + // escape and just be printed verbantim. + break; + } + } + '`' => { + tail.push(s); + backticks_after_slash += 1; + if backticks_after_slash == 2 { + escape_success = true; + iter.next(); + break; + } + } + _ => { break } + } + + iter.next(); + } + + if !escape_success { + line.push(c); + backticks = backticks_after_slash as usize; + update_backtick_count = false; + } else { + backticks = 0; + } + + line.push_str(&tail); + } else { + line.push(c); + } + } + + if update_backtick_count && c == '`' { + backticks += 1; + } else if update_backtick_count { + backticks = 0; + } + } + + lines.push(line); + lines +} + #[cfg(test)] #[allow(non_snake_case)] mod tests { @@ -652,6 +765,14 @@ mod tests { }; } + fn Lang(text: &str) -> Option> { Some(Spanned::zero(Ident(text.to_string()))) } + + macro_rules! C { + ($lang:expr, $($line:expr),* $(,)?) => { + SyntaxNode::CodeBlock(CodeBlockExpr { raw: vec![$($line.to_string()) ,*], lang: $lang }) + }; + } + macro_rules! P { ($($tts:tt)*) => { SyntaxNode::Par(Tree![@$($tts)*]) }; } @@ -799,6 +920,28 @@ mod tests { test("raw\\", vec!["raw\\"]); } + #[test] + fn test_unescape_code() { + fn test(raw: &str, expected: Vec<&str>) { + assert_eq!(unescape_code(raw), expected); + } + + test("code\\`", vec!["code\\`"]); + test("code`\\``", vec!["code```"]); + test("code`\\`a", vec!["code`\\`a"]); + test("code``hi`\\``", vec!["code``hi```"]); + test("code`\\\\``", vec!["code`\\``"]); + test("code`\\`\\`go", vec!["code`\\`\\`go"]); + test("code`\\`\\``", vec!["code`\\```"]); + test("code\ntext", vec!["code", "text"]); + test("a\r\nb", vec!["a", "b"]); + test("a\n\nb", vec!["a", "", "b"]); + test("a\r\x0Bb", vec!["a", "", "b"]); + test("a\r\n\r\nb", vec!["a", "", "b"]); + test("code\\a", vec!["code\\a"]); + test("code\\", vec!["code\\"]); + } + #[test] fn test_parse_simple_nodes() { t!("" => ); @@ -811,8 +954,19 @@ mod tests { t!("`py`" => P![R!["py"]]); t!("`hi\nyou" => P![R!["hi", "you"]]); e!("`hi\nyou" => s(1,3, 1,3, "expected backtick")); - t!("`hi\\`du`" => P![R!["hi`du"]]); - t!("💜\n\n 🌍" => P![T("💜")], P![T("🌍")]); + t!("`hi\\`du`" => P![R!["hi`du"]]); + t!("```java System.out.print```" => P![ + C![Lang("java"), "System.out.print"] + ]); + t!("``` console.log(\n\"alert\"\n)" => P![ + C![None, "console.log(", "\"alert\"", ")"] + ]); + t!("```typst \r\n Typst uses `\\`` to indicate code blocks" => P![ + C![Lang("typst"), " Typst uses ``` to indicate code blocks"] + ]); + e!("``` hi\nyou" => s(1,3, 1,3, "expected code block to close")); + e!("```🌍 hi\nyou```" => s(0,3, 0,4, "expected language to be a valid identifier")); + t!("💜\n\n 🌍" => P![T("💜")], P![T("🌍")]); ts!("hi" => s(0,0, 0,2, P![s(0,0, 0,2, T("hi"))])); ts!("*Hi*" => s(0,0, 0,4, P![ diff --git a/src/syntax/tokens.rs b/src/syntax/tokens.rs index 1dcf9022e..dbba175dc 100644 --- a/src/syntax/tokens.rs +++ b/src/syntax/tokens.rs @@ -90,6 +90,16 @@ pub enum Token<'s> { terminated: bool, }, + /// Multi-line code block. + Code { + /// The language of the code block, if specified. + lang: Option>, + /// The raw text (not yet unescaped as for strings). + raw: &'s str, + /// Whether the closing backticks were present. + terminated: bool, + }, + /// Any other consecutive string. Text(&'s str), @@ -127,6 +137,7 @@ impl<'s> Token<'s> { Underscore => "underscore", Backslash => "backslash", Raw { .. } => "raw text", + Code { .. } => "code block", Text(_) => "text", Invalid("*/") => "end of block comment", Invalid(_) => "invalid token", @@ -241,7 +252,7 @@ impl<'s> Iterator for Tokens<'s> { // Style toggles. '_' if self.mode == Body => Underscore, - '`' if self.mode == Body => self.read_raw(), + '`' if self.mode == Body => self.read_raw_and_code(), // An escaped thing. '\\' if self.mode == Body => self.read_escaped(), @@ -330,8 +341,65 @@ impl<'s> Tokens<'s> { Str { string, terminated } } - fn read_raw(&mut self) -> Token<'s> { + fn read_raw_and_code(&mut self) -> Token<'s> { let (raw, terminated) = self.read_until_unescaped('`'); + if raw.len() == 0 && terminated && self.peek() == Some('`') { + // Third tick found; this is a code block + self.eat(); + let mut backticks = 0; + let mut terminated = true; + // Reads the lang tag (until newline or whitespace) + let lang_start = self.pos(); + let (lang_opt, _) = self.read_string_until( + |c| c == '`' || c.is_whitespace() || is_newline_char(c), + false, 0, 0); + let lang_end = self.pos(); + + #[derive(Debug, PartialEq)] + enum WhitespaceIngestion { All, ExceptNewline, Never } + let mut ingest_whitespace = WhitespaceIngestion::Never; + let mut start = self.index(); + + while backticks < 3 { + match self.eat() { + Some('`') => backticks += 1, + Some('\\') if backticks == 1 && self.peek() == Some('`') => { + backticks = 0; + } + Some(c) => { + // Remove whitespace between language and content or + // first line break, deal with CRLF and CR line endings. + if ingest_whitespace != WhitespaceIngestion::All + && c == '\n' { + start += 1; + ingest_whitespace = WhitespaceIngestion::All; + } else if ingest_whitespace != WhitespaceIngestion::All + && c == '\r' { + start += 1; + ingest_whitespace = WhitespaceIngestion::ExceptNewline; + } else if ingest_whitespace == WhitespaceIngestion::Never + && c.is_whitespace() { + start += 1; + } else { + ingest_whitespace = WhitespaceIngestion::All; + } + } + None => { + terminated = false; + break; + } + } + } + let end = self.index() - (if terminated { 3 } else { 0 }); + + return Code { + lang: if lang_opt.len() == 0 { None } else { + Some(Spanned::new(lang_opt, Span::new(lang_start, lang_end))) + }, + raw: &self.src[start..end], + terminated + } + } Raw { raw, terminated } } @@ -494,6 +562,7 @@ mod tests { use crate::length::Length; use crate::syntax::tests::*; use super::*; + use super::super::span::Spanned; use Token::{ Space as S, LineComment as LC, BlockComment as BC, @@ -515,6 +584,9 @@ mod tests { fn Str(string: &str, terminated: bool) -> Token { Token::Str { string, terminated } } fn Raw(raw: &str, terminated: bool) -> Token { Token::Raw { raw, terminated } } + fn Code<'a>(lang: Option<&'a str>, raw: &'a str, terminated: bool) -> Token<'a> { + Token::Code { lang: lang.map(Spanned::zero), raw, terminated } + } macro_rules! t { ($($tts:tt)*) => {test!(@spans=false, $($tts)*)} } macro_rules! ts { ($($tts:tt)*) => {test!(@spans=true, $($tts)*)} } @@ -568,6 +640,10 @@ mod tests { t!(Body, "`[func]`" => Raw("[func]", true)); t!(Body, "`]" => Raw("]", false)); t!(Body, "`\\``" => Raw("\\`", true)); + t!(Body, "``not code`" => Raw("", true), T("not"), S(0), T("code"), Raw("", false)); + t!(Body, "```rust hi```" => Code(Some("rust"), "hi", true)); + t!(Body, "``` hi`\\``" => Code(None, "hi`\\``", false)); + t!(Body, "```js \r\n document.write(\"go\")" => Code(Some("js"), " document.write(\"go\")", false)); t!(Body, "\\ " => Backslash, S(0)); t!(Header, "_`" => Invalid("_`")); } diff --git a/src/syntax/tree.rs b/src/syntax/tree.rs index ae2e98920..ace5ad8ec 100644 --- a/src/syntax/tree.rs +++ b/src/syntax/tree.rs @@ -31,6 +31,8 @@ pub enum SyntaxNode { Text(String), /// Lines of raw text. Raw(Vec), + /// An optionally highlighted multi-line code block. + CodeBlock(CodeBlockExpr), /// A paragraph of child nodes. Par(SyntaxTree), /// A function call. @@ -199,3 +201,9 @@ impl CallExpr { } } } +/// An code block. +#[derive(Debug, Clone, PartialEq)] +pub struct CodeBlockExpr { + pub lang: Option>, + pub raw: Vec, +} From 1eb584e256a3ce780029c7ab55c9e5891d05df3a Mon Sep 17 00:00:00 2001 From: Martin Haug Date: Sat, 29 Aug 2020 14:10:56 +0200 Subject: [PATCH 2/3] =?UTF-8?q?Fixing=20the=20build=20by=20removing=20P=20?= =?UTF-8?q?macro=20usage=20=F0=9F=9A=91?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Kids, merge with caution! --- src/syntax/parsing.rs | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) diff --git a/src/syntax/parsing.rs b/src/syntax/parsing.rs index 0e6fba5e6..76509faee 100644 --- a/src/syntax/parsing.rs +++ b/src/syntax/parsing.rs @@ -761,7 +761,7 @@ mod tests { SyntaxNode::CodeBlock(CodeBlockExpr { raw: vec![$($line.to_string()) ,*], lang: $lang }) }; } - + macro_rules! F { ($($tts:tt)*) => { SyntaxNode::Call(Call!(@$($tts)*)) } } @@ -941,18 +941,18 @@ mod tests { e!("`hi\nyou" => s(1,3, 1,3, "expected backtick")); t!("`hi\\`du`" => R!["hi`du"]); - t!("```java System.out.print```" => P![ - C![Lang("java"), "System.out.print"] + t!("```java System.out.print```" => C![ + Lang("java"), "System.out.print" ]); - t!("``` console.log(\n\"alert\"\n)" => P![ - C![None, "console.log(", "\"alert\"", ")"] + t!("``` console.log(\n\"alert\"\n)" => C![ + None, "console.log(", "\"alert\"", ")" ]); - t!("```typst \r\n Typst uses `\\`` to indicate code blocks" => P![ - C![Lang("typst"), " Typst uses ``` to indicate code blocks"] + t!("```typst \r\n Typst uses `\\`` to indicate code blocks" => C![ + Lang("typst"), " Typst uses ``` to indicate code blocks" ]); e!("``` hi\nyou" => s(1,3, 1,3, "expected code block to close")); e!("```🌍 hi\nyou```" => s(0,3, 0,4, "expected language to be a valid identifier")); - t!("💜\n\n 🌍" => P![T("💜")], P![T("🌍")]); + t!("💜\n\n 🌍" => T("💜"), P, T("🌍")); ts!("hi" => s(0,0, 0,2, T("hi"))); ts!("*Hi*" => s(0,0, 0,1, B), s(0,1, 0,3, T("Hi")), s(0,3, 0,4, B)); From d68367f32a9e698923b554984c59f0671e27ba5f Mon Sep 17 00:00:00 2001 From: Martin Haug Date: Sat, 29 Aug 2020 17:20:04 +0200 Subject: [PATCH 3/3] =?UTF-8?q?Newlines=20are=20complicated,=20y'all=20?= =?UTF-8?q?=F0=9F=98=B1?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Co-authored-by: laurmaedje@outlook.de --- src/layout/tree.rs | 35 +++---- src/syntax/parsing.rs | 228 ++++++++++++++++++++++-------------------- src/syntax/tokens.rs | 89 +++++++++-------- src/syntax/tree.rs | 11 +- 4 files changed, 188 insertions(+), 175 deletions(-) diff --git a/src/layout/tree.rs b/src/layout/tree.rs index 714cfe274..16a2930ae 100644 --- a/src/layout/tree.rs +++ b/src/layout/tree.rs @@ -3,7 +3,7 @@ use crate::style::LayoutStyle; use crate::syntax::decoration::Decoration; use crate::syntax::span::{Span, Spanned}; -use crate::syntax::tree::{CallExpr, SyntaxNode, SyntaxTree, CodeBlockExpr}; +use crate::syntax::tree::{CallExpr, SyntaxNode, SyntaxTree, Code}; use crate::{DynFuture, Feedback, Pass}; use super::line::{LineContext, LineLayouter}; use super::text::{layout_text, TextContext}; @@ -63,10 +63,7 @@ impl<'a> TreeLayouter<'a> { match &node.v { SyntaxNode::Spacing => self.layout_space(), SyntaxNode::Linebreak => self.layouter.finish_line(), - SyntaxNode::Parbreak => self.layouter.add_secondary_spacing( - self.style.text.paragraph_spacing(), - SpacingKind::PARAGRAPH, - ), + SyntaxNode::Parbreak => self.layout_parbreak(), SyntaxNode::ToggleItalic => { self.style.text.italic = !self.style.text.italic; @@ -84,7 +81,7 @@ impl<'a> TreeLayouter<'a> { } SyntaxNode::Raw(lines) => self.layout_raw(lines).await, - SyntaxNode::CodeBlock(block) => self.layout_code(block).await, + SyntaxNode::Code(block) => self.layout_code(block).await, SyntaxNode::Call(call) => { self.layout_call(Spanned::new(call, node.span)).await; @@ -99,6 +96,13 @@ impl<'a> TreeLayouter<'a> { ); } + fn layout_parbreak(&mut self) { + self.layouter.add_secondary_spacing( + self.style.text.paragraph_spacing(), + SpacingKind::PARAGRAPH, + ); + } + async fn layout_text(&mut self, text: &str) { self.layouter.add( layout_text( @@ -133,19 +137,16 @@ impl<'a> TreeLayouter<'a> { self.style.text.fallback = fallback; } - async fn layout_code(&mut self, block: &CodeBlockExpr) { - let fallback = self.style.text.fallback.clone(); - self.style.text.fallback - .list_mut() - .insert(0, "monospace".to_string()); - self.style.text.fallback.flatten(); - - for line in &block.raw { - self.layout_text(line).await; - self.layouter.finish_line(); + async fn layout_code(&mut self, code: &Code) { + if code.block { + self.layout_parbreak(); } - self.style.text.fallback = fallback; + self.layout_raw(&code.lines).await; + + if code.block { + self.layout_parbreak() + } } async fn layout_call(&mut self, call: Spanned<&CallExpr>) { diff --git a/src/syntax/parsing.rs b/src/syntax/parsing.rs index 76509faee..0d12f6e18 100644 --- a/src/syntax/parsing.rs +++ b/src/syntax/parsing.rs @@ -7,14 +7,9 @@ use crate::color::RgbaColor; use crate::compute::table::SpannedEntry; use super::decoration::Decoration; use super::span::{Pos, Span, Spanned}; -use super::tokens::{is_newline_char, Token, TokenMode, Tokens, is_identifier}; +use super::tokens::{is_newline_char, Token, TokenMode, Tokens}; use super::tree::{ - CallExpr, - Expr, - SyntaxNode, - SyntaxTree, - TableExpr, - CodeBlockExpr, + CallExpr, Expr, SyntaxNode, SyntaxTree, TableExpr, Code, }; use super::Ident; @@ -88,28 +83,27 @@ impl Parser<'_> { if !terminated { error!( @self.feedback, Span::at(token.span.end), - "expected code block to close", + "expected backticks", ); } - let mut valid_ident = false; - let mut lang = lang.map(|s| s.map(|v| { - if is_identifier(v) { - valid_ident = true; - } - Ident(v.to_string()) - })); - if !valid_ident { - if let Some(l) = lang { - error!( - @self.feedback, l.span, - "expected language to be a valid identifier", - ); + let lang = lang.and_then(|lang| { + if let Some(ident) = Ident::new(lang.v) { + Some(Spanned::new(ident, lang.span)) + } else { + error!(@self.feedback, lang.span, "invalid identifier"); + None } - lang = None; + }); + + let mut lines = unescape_code(raw); + let block = lines.len() > 1; + + if lines.last().map(|s| s.is_empty()).unwrap_or(false) { + lines.pop(); } - self.with_span(SyntaxNode::CodeBlock(CodeBlockExpr { raw: unescape_code(raw), lang })) + self.with_span(SyntaxNode::Code(Code { lang, lines, block })) } Token::Text(text) => { @@ -624,45 +618,99 @@ fn unescape_string(string: &str) -> String { /// Unescape raw markup and split it into into lines. fn unescape_raw(raw: &str) -> Vec { let mut iter = raw.chars().peekable(); - let mut line = String::new(); - let mut lines = Vec::new(); + let mut text = String::new(); while let Some(c) = iter.next() { if c == '\\' { - match iter.next() { - Some('`') => line.push('`'), - Some(c) => { line.push('\\'); line.push(c); } - None => line.push('\\'), - } - } else if is_newline_char(c) { - if c == '\r' && iter.peek() == Some(&'\n') { - iter.next(); - } + if let Some(c) = iter.next() { + if c != '\\' && c != '`' { + text.push('\\'); + } - lines.push(std::mem::take(&mut line)); + text.push(c); + } else { + text.push('\\'); + } } else { - line.push(c); + text.push(c); } } - lines.push(line); - lines + split_lines(&text) } /// Unescape raw markup and split it into into lines. fn unescape_code(raw: &str) -> Vec { let mut iter = raw.chars().peekable(); - let mut line = String::new(); - let mut lines = Vec::new(); - let mut backticks: usize = 0; - - // This assignment is used in line 731, 733; - // the compiler does not want to acknowledge that, however. - #[allow(unused_assignments)] - let mut update_backtick_count = true; + let mut text = String::new(); + let mut backticks = 0u32; + let mut update_backtick_count; while let Some(c) = iter.next() { update_backtick_count = true; + + if c == '\\' && backticks > 0 { + let mut tail = String::new(); + let mut escape_success = false; + let mut backticks_after_slash = 0u32; + + while let Some(&s) = iter.peek() { + match s { + '\\' => { + if backticks_after_slash == 0 { + tail.push('\\'); + } else { + // Pattern like `\`\` should fail + // escape and just be printed verbantim. + break; + } + } + '`' => { + tail.push(s); + backticks_after_slash += 1; + if backticks_after_slash == 2 { + escape_success = true; + iter.next(); + break; + } + } + _ => break, + } + + iter.next(); + } + + if !escape_success { + text.push(c); + backticks = backticks_after_slash; + update_backtick_count = false; + } else { + backticks = 0; + } + + text.push_str(&tail); + } else { + text.push(c); + } + + if update_backtick_count { + if c == '`' { + backticks += 1; + } else { + backticks = 0; + } + } + } + + split_lines(&text) +} + +fn split_lines(text: &str) -> Vec { + let mut iter = text.chars().peekable(); + let mut line = String::new(); + let mut lines = Vec::new(); + + while let Some(c) = iter.next() { if is_newline_char(c) { if c == '\r' && iter.peek() == Some(&'\n') { iter.next(); @@ -670,56 +718,7 @@ fn unescape_code(raw: &str) -> Vec { lines.push(std::mem::take(&mut line)); } else { - if c == '\\' && backticks > 0 { - let mut tail = String::new(); - let mut escape_success = false; - - let mut backticks_after_slash: u8 = 0; - - while let Some(&s) = iter.peek() { - match s { - '\\' => { - if backticks_after_slash == 0 { - tail.push(s); - } else { - // Pattern like `\`\` should fail - // escape and just be printed verbantim. - break; - } - } - '`' => { - tail.push(s); - backticks_after_slash += 1; - if backticks_after_slash == 2 { - escape_success = true; - iter.next(); - break; - } - } - _ => { break } - } - - iter.next(); - } - - if !escape_success { - line.push(c); - backticks = backticks_after_slash as usize; - update_backtick_count = false; - } else { - backticks = 0; - } - - line.push_str(&tail); - } else { - line.push(c); - } - } - - if update_backtick_count && c == '`' { - backticks += 1; - } else if update_backtick_count { - backticks = 0; + line.push(c); } } @@ -753,13 +752,23 @@ mod tests { }; } - - fn Lang(text: &str) -> Option> { Some(Spanned::zero(Ident(text.to_string()))) } - macro_rules! C { - ($lang:expr, $($line:expr),* $(,)?) => { - SyntaxNode::CodeBlock(CodeBlockExpr { raw: vec![$($line.to_string()) ,*], lang: $lang }) - }; + (None, $($line:expr),* $(,)?) => {{ + let lines = vec![$($line.to_string()) ,*]; + SyntaxNode::Code(Code { + lang: None, + block: lines.len() > 1, + lines, + }) + }}; + (Some($lang:expr), $($line:expr),* $(,)?) => {{ + let lines = vec![$($line.to_string()) ,*]; + SyntaxNode::Code(Code { + lang: Some(Into::>::into($lang).map(|s| Ident(s.to_string()))), + block: lines.len() > 1, + lines, + }) + }}; } macro_rules! F { @@ -896,6 +905,7 @@ mod tests { } test("raw\\`", vec!["raw`"]); + test("raw\\\\`", vec!["raw\\`"]); test("raw\ntext", vec!["raw", "text"]); test("a\r\nb", vec!["a", "b"]); test("a\n\nb", vec!["a", "", "b"]); @@ -942,16 +952,16 @@ mod tests { t!("`hi\\`du`" => R!["hi`du"]); t!("```java System.out.print```" => C![ - Lang("java"), "System.out.print" - ]); + Some("java"), "System.out.print" + ]); t!("``` console.log(\n\"alert\"\n)" => C![ None, "console.log(", "\"alert\"", ")" - ]); + ]); t!("```typst \r\n Typst uses `\\`` to indicate code blocks" => C![ - Lang("typst"), " Typst uses ``` to indicate code blocks" - ]); - e!("``` hi\nyou" => s(1,3, 1,3, "expected code block to close")); - e!("```🌍 hi\nyou```" => s(0,3, 0,4, "expected language to be a valid identifier")); + Some("typst"), " Typst uses ``` to indicate code blocks" + ]); + e!("``` hi\nyou" => s(1,3, 1,3, "expected backticks")); + e!("```🌍 hi\nyou```" => s(0,3, 0,4, "invalid identifier")); t!("💜\n\n 🌍" => T("💜"), P, T("🌍")); ts!("hi" => s(0,0, 0,2, T("hi"))); diff --git a/src/syntax/tokens.rs b/src/syntax/tokens.rs index dbba175dc..7ecb05fe4 100644 --- a/src/syntax/tokens.rs +++ b/src/syntax/tokens.rs @@ -252,7 +252,7 @@ impl<'s> Iterator for Tokens<'s> { // Style toggles. '_' if self.mode == Body => Underscore, - '`' if self.mode == Body => self.read_raw_and_code(), + '`' if self.mode == Body => self.read_raw_or_code(), // An escaped thing. '\\' if self.mode == Body => self.read_escaped(), @@ -341,66 +341,67 @@ impl<'s> Tokens<'s> { Str { string, terminated } } - fn read_raw_and_code(&mut self) -> Token<'s> { + fn read_raw_or_code(&mut self) -> Token<'s> { let (raw, terminated) = self.read_until_unescaped('`'); - if raw.len() == 0 && terminated && self.peek() == Some('`') { - // Third tick found; this is a code block + if raw.is_empty() && terminated && self.peek() == Some('`') { + // Third tick found; this is a code block. self.eat(); - let mut backticks = 0; - let mut terminated = true; - // Reads the lang tag (until newline or whitespace) - let lang_start = self.pos(); - let (lang_opt, _) = self.read_string_until( - |c| c == '`' || c.is_whitespace() || is_newline_char(c), - false, 0, 0); - let lang_end = self.pos(); - #[derive(Debug, PartialEq)] - enum WhitespaceIngestion { All, ExceptNewline, Never } - let mut ingest_whitespace = WhitespaceIngestion::Never; - let mut start = self.index(); + // Reads the lang tag (until newline or whitespace). + let start = self.pos(); + let lang = self.read_string_until( + |c| c == '`' || c.is_whitespace() || is_newline_char(c), + false, 0, 0, + ).0; + let end = self.pos(); + let lang = if !lang.is_empty() { + Some(Spanned::new(lang, Span::new(start, end))) + } else { + None + }; + + // Skip to start of raw contents. + while let Some(c) = self.peek() { + if is_newline_char(c) { + self.eat(); + if c == '\r' && self.peek() == Some('\n') { + self.eat(); + } + + break; + } else if c.is_whitespace() { + self.eat(); + } else { + break; + } + } + + let start = self.index(); + let mut backticks = 0u32; while backticks < 3 { match self.eat() { Some('`') => backticks += 1, + // Escaping of triple backticks. Some('\\') if backticks == 1 && self.peek() == Some('`') => { backticks = 0; } - Some(c) => { - // Remove whitespace between language and content or - // first line break, deal with CRLF and CR line endings. - if ingest_whitespace != WhitespaceIngestion::All - && c == '\n' { - start += 1; - ingest_whitespace = WhitespaceIngestion::All; - } else if ingest_whitespace != WhitespaceIngestion::All - && c == '\r' { - start += 1; - ingest_whitespace = WhitespaceIngestion::ExceptNewline; - } else if ingest_whitespace == WhitespaceIngestion::Never - && c.is_whitespace() { - start += 1; - } else { - ingest_whitespace = WhitespaceIngestion::All; - } - } - None => { - terminated = false; - break; - } + Some(_) => {} + None => break, } } - let end = self.index() - (if terminated { 3 } else { 0 }); - return Code { - lang: if lang_opt.len() == 0 { None } else { - Some(Spanned::new(lang_opt, Span::new(lang_start, lang_end))) - }, + let terminated = backticks == 3; + let end = self.index() - if terminated { 3 } else { 0 }; + + Code { + lang, raw: &self.src[start..end], terminated } + } else { + Raw { raw, terminated } } - Raw { raw, terminated } } fn read_until_unescaped(&mut self, c: char) -> (&'s str, bool) { diff --git a/src/syntax/tree.rs b/src/syntax/tree.rs index 313e76a4a..44acd0234 100644 --- a/src/syntax/tree.rs +++ b/src/syntax/tree.rs @@ -33,8 +33,8 @@ pub enum SyntaxNode { Text(String), /// Lines of raw text. Raw(Vec), - /// An optionally highlighted multi-line code block. - CodeBlock(CodeBlockExpr), + /// An optionally highlighted (multi-line) code block. + Code(Code), /// A function call. Call(CallExpr), } @@ -201,9 +201,10 @@ impl CallExpr { } } } -/// An code block. +/// A code block. #[derive(Debug, Clone, PartialEq)] -pub struct CodeBlockExpr { +pub struct Code { pub lang: Option>, - pub raw: Vec, + pub lines: Vec, + pub block: bool, }