Lexer change: Allow emphasis in CJK text without spaces (#2648)

This commit is contained in:
Peng Guanwen 2023-11-15 22:01:15 +08:00 committed by GitHub
parent 50ea3b4f16
commit f4a81091f7
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
8 changed files with 20 additions and 2 deletions

1
Cargo.lock generated
View File

@ -3117,6 +3117,7 @@ dependencies = [
"tracing",
"unicode-ident",
"unicode-math-class",
"unicode-script",
"unicode-segmentation",
"unscanny",
]

Binary file not shown.

Binary file not shown.

View File

@ -23,5 +23,6 @@ serde = { workspace = true }
tracing = { workspace = true }
unicode-ident = { workspace = true }
unicode-math-class = { workspace = true }
unicode-script = { workspace = true }
unicode-segmentation = { workspace = true }
unscanny = { workspace = true }

View File

@ -1,5 +1,6 @@
use ecow::{eco_format, EcoString};
use unicode_ident::{is_xid_continue, is_xid_start};
use unicode_script::{Script, UnicodeScript};
use unicode_segmentation::UnicodeSegmentation;
use unscanny::Scanner;
@ -343,10 +344,18 @@ impl Lexer<'_> {
}
/// Whether the character pair around the current position forms the inside
/// of a word.
///
/// Returns `true` only if both inspected characters are "wordy":
/// alphanumeric and *not* belonging to the Han, Hiragana, or Katakana
/// scripts. Those scripts are written without spaces between words, so
/// excluding them lets emphasis markers take effect directly inside CJK
/// text (e.g. `中文*粗体*中文`), while `hello_world` still counts as a
/// single word.
fn in_word(&self) -> bool {
    // `None` (no character at that position) is never wordy.
    let wordy = |c: Option<char>| {
        c.map_or(false, |c| {
            c.is_alphanumeric()
                && !matches!(
                    c.script(),
                    Script::Han | Script::Hiragana | Script::Katakana
                )
        })
    };
    // NOTE(review): `scout(-2)` presumably looks past the just-consumed
    // marker character to the one before it — confirm against the scanner
    // API.
    let prev = self.s.scout(-2);
    let next = self.s.peek();
    wordy(prev) && wordy(next)
}
fn space_or_end(&self) -> bool {

Binary file not shown.

Before

Width:  |  Height:  |  Size: 8.4 KiB

After

Width:  |  Height:  |  Size: 17 KiB

Binary file not shown.

Before

Width:  |  Height:  |  Size: 1.4 KiB

After

Width:  |  Height:  |  Size: 1.4 KiB

View File

@ -7,6 +7,13 @@ _Emphasized and *strong* words!_
// Inside of a word it's a normal underscore or star.
hello_world Nutzer*innen
// CJK characters do not need surrounding spaces for emphasis to apply.
中文一般使用*粗体*或者_楷体_来表示强调。
日本語では、*太字*_斜体_を使って強調します。
中文中混有*Strong*_Empasis_
// Can contain paragraph in nested content block.
_Still #[