Use icu4x for linebreaking algorithm (#1355)

This commit is contained in:
Peng Guanwen 2023-05-30 23:53:10 +08:00 committed by GitHub
parent 11714609b8
commit e2bf2327b5
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
10 changed files with 331 additions and 13 deletions

254
Cargo.lock generated
View File

@ -338,6 +338,12 @@ dependencies = [
"roff", "roff",
] ]
[[package]]
name = "cobs"
version = "0.2.3"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "67ba02a97a2bd10f4b59b25c7973101c79642302776489e030cd13cdab09ed15"
[[package]] [[package]]
name = "codespan-reporting" name = "codespan-reporting"
version = "0.11.1" version = "0.11.1"
@ -803,6 +809,118 @@ dependencies = [
"cc", "cc",
] ]
[[package]]
name = "icu_collections"
version = "1.2.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "ef8302d8dfd6044d3ddb3f807a5ef3d7bbca9a574959c6d6e4dc39aa7012d0d5"
dependencies = [
"displaydoc",
"serde",
"yoke",
"zerofrom",
"zerovec",
]
[[package]]
name = "icu_locid"
version = "1.2.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "3003f85dccfc0e238ff567693248c59153a46f4e6125ba4020b973cef4d1d335"
dependencies = [
"displaydoc",
"litemap",
"tinystr",
"writeable",
"zerovec",
]
[[package]]
name = "icu_properties"
version = "1.2.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "ce0e1aa26851f16c9e04412a5911c86b7f8768dac8f8d4c5f1c568a7e5d7a434"
dependencies = [
"displaydoc",
"icu_collections",
"icu_provider",
"serde",
"tinystr",
"zerovec",
]
[[package]]
name = "icu_provider"
version = "1.2.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "8dc312a7b6148f7dfe098047ae2494d12d4034f48ade58d4f353000db376e305"
dependencies = [
"displaydoc",
"icu_locid",
"icu_provider_macros",
"postcard",
"serde",
"stable_deref_trait",
"writeable",
"yoke",
"zerofrom",
"zerovec",
]
[[package]]
name = "icu_provider_adapters"
version = "1.2.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "f4ae1e2bd0c41728b77e7c46e9afdec5e2127d1eedacc684724667d50c126bd3"
dependencies = [
"icu_locid",
"icu_provider",
"tinystr",
"yoke",
"zerovec",
]
[[package]]
name = "icu_provider_blob"
version = "1.2.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "fd364c9a01f791a4bc04a74cf2a1d01d9f6926a40fd5ae1c28004e1e70d8338b"
dependencies = [
"icu_provider",
"postcard",
"serde",
"writeable",
"yoke",
"zerovec",
]
[[package]]
name = "icu_provider_macros"
version = "1.2.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "dd8b728b9421e93eff1d9f8681101b78fa745e0748c95c655c83f337044a7e10"
dependencies = [
"proc-macro2",
"quote",
"syn 1.0.109",
]
[[package]]
name = "icu_segmenter"
version = "1.2.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "c3300a7b6bf187be98a57264ad094f11f2e062c2e8263132af010ff522ee5495"
dependencies = [
"displaydoc",
"icu_collections",
"icu_locid",
"icu_provider",
"num-traits",
"serde",
"utf8_iter",
"zerovec",
]
[[package]] [[package]]
name = "idna" name = "idna"
version = "0.3.0" version = "0.3.0"
@ -1063,6 +1181,12 @@ dependencies = [
"libdeflate-sys", "libdeflate-sys",
] ]
[[package]]
name = "libm"
version = "0.2.7"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "f7012b1bbb0719e1097c47611d3898568c546d597c2e74d66f6087edd5233ff4"
[[package]] [[package]]
name = "linked-hash-map" name = "linked-hash-map"
version = "0.5.6" version = "0.5.6"
@ -1085,6 +1209,12 @@ dependencies = [
"rand_chacha", "rand_chacha",
] ]
[[package]]
name = "litemap"
version = "0.7.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "3a04a5b2b6f54acba899926491d0a6c59d98012938ca2ab5befb281c034e8f94"
[[package]] [[package]]
name = "lock_api" name = "lock_api"
version = "0.4.9" version = "0.4.9"
@ -1227,6 +1357,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "578ede34cf02f8924ab9447f50c28075b4d3e5b269972345e7e0372b38c6cdcd" checksum = "578ede34cf02f8924ab9447f50c28075b4d3e5b269972345e7e0372b38c6cdcd"
dependencies = [ dependencies = [
"autocfg", "autocfg",
"libm",
] ]
[[package]] [[package]]
@ -1397,6 +1528,16 @@ dependencies = [
"miniz_oxide", "miniz_oxide",
] ]
[[package]]
name = "postcard"
version = "1.0.4"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "cfa512cd0d087cc9f99ad30a1bf64795b67871edbead083ffc3a4dfafa59aa00"
dependencies = [
"cobs",
"serde",
]
[[package]] [[package]]
name = "ppv-lite86" name = "ppv-lite86"
version = "0.2.17" version = "0.2.17"
@ -1911,6 +2052,18 @@ dependencies = [
"unicode-ident", "unicode-ident",
] ]
[[package]]
name = "synstructure"
version = "0.12.6"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "f36bdaa60a83aca3921b5259d5400cbf5e90fc51931376a9bd4a0eb79aa7210f"
dependencies = [
"proc-macro2",
"quote",
"syn 1.0.109",
"unicode-xid",
]
[[package]] [[package]]
name = "syntect" name = "syntect"
version = "5.0.0" version = "5.0.0"
@ -2056,6 +2209,8 @@ source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "7ac3f5b6856e931e15e07b478e98c8045239829a65f9156d4fa7e7788197a5ef" checksum = "7ac3f5b6856e931e15e07b478e98c8045239829a65f9156d4fa7e7788197a5ef"
dependencies = [ dependencies = [
"displaydoc", "displaydoc",
"serde",
"zerovec",
] ]
[[package]] [[package]]
@ -2299,6 +2454,11 @@ dependencies = [
"ecow", "ecow",
"hayagriva", "hayagriva",
"hypher", "hypher",
"icu_properties",
"icu_provider",
"icu_provider_adapters",
"icu_provider_blob",
"icu_segmenter",
"kurbo", "kurbo",
"lipsum", "lipsum",
"log", "log",
@ -2319,7 +2479,6 @@ dependencies = [
"unicode-math-class", "unicode-math-class",
"unicode-script", "unicode-script",
"unicode-segmentation", "unicode-segmentation",
"xi-unicode",
] ]
[[package]] [[package]]
@ -2447,6 +2606,12 @@ version = "0.1.10"
source = "registry+https://github.com/rust-lang/crates.io-index" source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "c0edd1e5b14653f783770bce4a4dabb4a5108a5370a5f5d8cfe8710c361f6c8b" checksum = "c0edd1e5b14653f783770bce4a4dabb4a5108a5370a5f5d8cfe8710c361f6c8b"
[[package]]
name = "unicode-xid"
version = "0.2.4"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "f962df74c8c05a667b5ee8bcf162993134c104e96440b663c8daa176dc772d8c"
[[package]] [[package]]
name = "unicode_names2" name = "unicode_names2"
version = "0.6.0" version = "0.6.0"
@ -2530,6 +2695,12 @@ dependencies = [
"svgtypes", "svgtypes",
] ]
[[package]]
name = "utf8_iter"
version = "1.0.3"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "64a8922555b9500e3d865caed19330172cd67cbf82203f1a3311d8c305cc9f33"
[[package]] [[package]]
name = "utf8parse" name = "utf8parse"
version = "0.2.1" version = "0.2.1"
@ -2814,6 +2985,12 @@ dependencies = [
"memchr", "memchr",
] ]
[[package]]
name = "writeable"
version = "0.5.2"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "60e49e42bdb1d5dc76f4cd78102f8f0714d32edfa3efb82286eb0f0b1fc0da0f"
[[package]] [[package]]
name = "wyz" name = "wyz"
version = "0.5.1" version = "0.5.1"
@ -2823,12 +3000,6 @@ dependencies = [
"tap", "tap",
] ]
[[package]]
name = "xi-unicode"
version = "0.3.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "a67300977d3dc3f8034dae89778f502b6ba20b269527b3223ba59c0cf393bb8a"
[[package]] [[package]]
name = "xmlparser" name = "xmlparser"
version = "0.13.5" version = "0.13.5"
@ -2866,6 +3037,75 @@ dependencies = [
"linked-hash-map", "linked-hash-map",
] ]
[[package]]
name = "yoke"
version = "0.7.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "1848075a23a28f9773498ee9a0f2cf58fcbad4f8c0ccf84a210ab33c6ae495de"
dependencies = [
"serde",
"stable_deref_trait",
"yoke-derive",
"zerofrom",
]
[[package]]
name = "yoke-derive"
version = "0.7.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "af46c169923ed7516eef0aa32b56d2651b229f57458ebe46b49ddd6efef5b7a2"
dependencies = [
"proc-macro2",
"quote",
"syn 1.0.109",
"synstructure",
]
[[package]]
name = "zerofrom"
version = "0.1.2"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "df54d76c3251de27615dfcce21e636c172dafb2549cd7fd93e21c66f6ca6bea2"
dependencies = [
"zerofrom-derive",
]
[[package]]
name = "zerofrom-derive"
version = "0.1.2"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "b4eae7c1f7d4b8eafce526bc0771449ddc2f250881ae31c50d22c032b5a1c499"
dependencies = [
"proc-macro2",
"quote",
"syn 1.0.109",
"synstructure",
]
[[package]]
name = "zerovec"
version = "0.9.4"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "198f54134cd865f437820aa3b43d0ad518af4e68ee161b444cdd15d8e567c8ea"
dependencies = [
"serde",
"yoke",
"zerofrom",
"zerovec-derive",
]
[[package]]
name = "zerovec-derive"
version = "0.9.4"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "486558732d5dde10d0f8cb2936507c1bb21bc539d924c949baf5f36a58e51bac"
dependencies = [
"proc-macro2",
"quote",
"syn 1.0.109",
"synstructure",
]
[[package]] [[package]]
name = "zopfli" name = "zopfli"
version = "0.7.2" version = "0.7.2"

Binary file not shown.

Binary file not shown.

Binary file not shown.

View File

@ -25,6 +25,11 @@ csv = "1"
ecow = "0.1" ecow = "0.1"
hayagriva = "0.3" hayagriva = "0.3"
hypher = "0.1" hypher = "0.1"
icu_properties = { version = "1.2.0", features = ["serde"] }
icu_provider = { version = "1.2.0", features = ["sync"] }
icu_provider_adapters = "1.2.0"
icu_provider_blob = "1.2.0"
icu_segmenter = { version = "1.2.1", features = ["serde"] }
kurbo = "0.9" kurbo = "0.9"
lipsum = "0.9" lipsum = "0.9"
log = "0.4" log = "0.4"
@ -44,4 +49,3 @@ unicode-bidi = "0.3.13"
unicode-math-class = "0.1" unicode-math-class = "0.1"
unicode-script = "0.5" unicode-script = "0.5"
unicode-segmentation = "1" unicode-segmentation = "1"
xi-unicode = "0.3"

View File

@ -1,7 +1,12 @@
use icu_properties::{maps::CodePointMapData, LineBreak};
use icu_provider::AsDeserializingBufferProvider;
use icu_provider_adapters::fork::ForkByKeyProvider;
use icu_provider_blob::BlobDataProvider;
use icu_segmenter::{LineBreakIteratorUtf8, LineSegmenter};
use once_cell::sync::Lazy;
use typst::eval::Tracer; use typst::eval::Tracer;
use unicode_bidi::{BidiInfo, Level as BidiLevel}; use unicode_bidi::{BidiInfo, Level as BidiLevel};
use unicode_script::{Script, UnicodeScript}; use unicode_script::{Script, UnicodeScript};
use xi_unicode::LineBreakIterator;
use super::{BoxElem, HElem, Sizing, Spacing}; use super::{BoxElem, HElem, Sizing, Spacing};
use crate::layout::AlignElem; use crate::layout::AlignElem;
@ -998,15 +1003,65 @@ fn linebreak_optimized<'a>(vt: &Vt, p: &'a Preparation<'a>, width: Abs) -> Vec<L
lines lines
} }
/// The ICU4X data blob, embedded into the binary at compile time.
///
/// Generated by the following command:
///
/// ```sh
/// icu4x-datagen --locales full --keys-for-bin target/debug/typst \
/// --format blob --out assets/data/icudata.postcard --overwrite
/// ```
///
/// Install icu4x-datagen with `cargo install icu4x-datagen`.
static ICU_DATA: &[u8] = include_bytes!("../../../assets/data/icudata.postcard");
/// The line segmentation data blob for Chinese and Japanese, embedded into
/// the binary at compile time.
///
/// Generated by the following command:
///
/// ```sh
/// icu4x-datagen --locales zh ja --keys segmenter/line@1 --format blob \
/// --out assets/data/cj_linebreak_data.postcard --overwrite
/// ```
///
/// The icu4x-datagen used for this should be patched with
/// https://github.com/peng1999/icu4x/commit/b9beb6cbf633d61fc3d7983e5baf7f4449fbfae5
static CJ_LINEBREAK_DATA: &[u8] =
include_bytes!("../../../assets/data/cj_linebreak_data.postcard");
/// The general line break segmenter.
///
/// Built lazily on first access from the embedded `ICU_DATA` blob.
static SEGMENTER: Lazy<LineSegmenter> = Lazy::new(|| {
// Both steps are unwrapped, so a blob that cannot be loaded or that lacks the
// data the segmenter needs results in a panic on first use.
let provider = BlobDataProvider::try_new_from_static_blob(ICU_DATA).unwrap();
LineSegmenter::try_new_lstm_with_buffer_provider(&provider).unwrap()
});
/// The line break segmenter for Chinese/Japanese text.
///
/// Built lazily on first access from the embedded `CJ_LINEBREAK_DATA` blob
/// combined with the general `ICU_DATA` blob.
static CJ_SEGMENTER: Lazy<LineSegmenter> = Lazy::new(|| {
let provider = BlobDataProvider::try_new_from_static_blob(ICU_DATA).unwrap();
let cj_blob = BlobDataProvider::try_new_from_static_blob(CJ_LINEBREAK_DATA).unwrap();
// The CJ blob is passed first; presumably it takes precedence for the keys
// it contains, with the general blob serving the rest.
// NOTE(review): confirm against the `ForkByKeyProvider` documentation.
let cj_provider = ForkByKeyProvider::new(cj_blob, provider);
LineSegmenter::try_new_lstm_with_buffer_provider(&cj_provider).unwrap()
});
/// The Unicode line break properties for each code point.
///
/// Built lazily on first access from the embedded `ICU_DATA` blob.
static LINEBREAK_DATA: Lazy<CodePointMapData<LineBreak>> = Lazy::new(|| {
let provider = BlobDataProvider::try_new_from_static_blob(ICU_DATA).unwrap();
// Adapt the blob provider with `as_deserializing` before handing it to
// `load_line_break`.
let deser_provider = provider.as_deserializing();
icu_properties::maps::load_line_break(&deser_provider).unwrap()
});
/// Determine all possible points in the text where lines can be broken. /// Determine all possible points in the text where lines can be broken.
/// ///
/// Returns for each breakpoint the text index, whether the break is mandatory /// Returns for each breakpoint the text index, whether the break is mandatory
/// (after `\n`) and whether a hyphen is required (when breaking inside of a /// (after `\n`) and whether a hyphen is required (when breaking inside of a
/// word). /// word).
fn breakpoints<'a>(p: &'a Preparation<'a>) -> Breakpoints<'a> { fn breakpoints<'a>(p: &'a Preparation<'a>) -> Breakpoints<'a> {
let mut linebreaks = if matches!(p.lang, Some(Lang::CHINESE | Lang::JAPANESE)) {
CJ_SEGMENTER.segment_str(p.bidi.text)
} else {
SEGMENTER.segment_str(p.bidi.text)
};
// The iterator always yields a breakpoint at index 0, we want to ignore it
linebreaks.next();
Breakpoints { Breakpoints {
p, p,
linebreaks: LineBreakIterator::new(p.bidi.text), linebreaks,
syllables: None, syllables: None,
offset: 0, offset: 0,
suffix: 0, suffix: 0,
@ -1020,7 +1075,7 @@ struct Breakpoints<'a> {
/// The paragraph's items. /// The paragraph's items.
p: &'a Preparation<'a>, p: &'a Preparation<'a>,
/// The inner iterator over the unicode line break opportunities. /// The inner iterator over the unicode line break opportunities.
linebreaks: LineBreakIterator<'a>, linebreaks: LineBreakIteratorUtf8<'a, 'a>,
/// Iterator over syllables of the current word. /// Iterator over syllables of the current word.
syllables: Option<hypher::Syllables<'a>>, syllables: Option<hypher::Syllables<'a>>,
/// The current text offset. /// The current text offset.
@ -1054,8 +1109,20 @@ impl Iterator for Breakpoints<'_> {
return Some((self.offset, self.mandatory && !hyphen, hyphen)); return Some((self.offset, self.mandatory && !hyphen, hyphen));
} }
let lb = LINEBREAK_DATA.as_borrowed();
// Get the next "word". // Get the next "word".
(self.end, self.mandatory) = self.linebreaks.next()?; self.end = self.linebreaks.next()?;
self.mandatory =
self.p.bidi.text[..self.end].chars().next_back().map_or(false, |c| {
matches!(
lb.get(c),
LineBreak::MandatoryBreak
| LineBreak::CarriageReturn
| LineBreak::LineFeed
| LineBreak::NextLine
) || self.end == self.p.bidi.text.len()
});
// Hyphenate the next word. // Hyphenate the next word.
if self.p.hyphenate != Some(false) { if self.p.hyphenate != Some(false) {

View File

@ -524,6 +524,7 @@ impl Lang {
pub const FRENCH: Self = Self(*b"fr ", 2); pub const FRENCH: Self = Self(*b"fr ", 2);
pub const GERMAN: Self = Self(*b"de ", 2); pub const GERMAN: Self = Self(*b"de ", 2);
pub const ITALIAN: Self = Self(*b"it ", 2); pub const ITALIAN: Self = Self(*b"it ", 2);
pub const JAPANESE: Self = Self(*b"ja ", 2);
pub const NYNORSK: Self = Self(*b"nn ", 2); pub const NYNORSK: Self = Self(*b"nn ", 2);
pub const POLISH: Self = Self(*b"pl ", 2); pub const POLISH: Self = Self(*b"pl ", 2);
pub const PORTUGUESE: Self = Self(*b"pt ", 2); pub const PORTUGUESE: Self = Self(*b"pt ", 2);

Binary file not shown.

Before

Width:  |  Height:  |  Size: 21 KiB

After

Width:  |  Height:  |  Size: 38 KiB

View File

@ -43,7 +43,7 @@
#set text(font: "Noto Serif CJK SC", lang: "zh") #set text(font: "Noto Serif CJK SC", lang: "zh")
#set par(justify: true) #set par(justify: true)
孔雀最早见于《山海经》中的《海内经》:\u{200b}“有孔雀。”东汉杨孚著《异物志》记载,岭南:“孔雀,其大如大雁而足高,毛皆有斑纹彩,捕而蓄之,拍手即舞。” 孔雀最早见于《山海经》中的《海内经》:“有孔雀。”东汉杨孚著《异物志》记载,岭南:“孔雀,其大如大雁而足高,毛皆有斑纹彩,捕而蓄之,拍手即舞。”
#set text(font: "Noto Serif CJK TC", lang: "zh", region: "hk") #set text(font: "Noto Serif CJK TC", lang: "zh", region: "hk")
孔雀最早见于《山海经》中的《海内经》:「有孔雀。」东汉杨孚著《异物志》记载,岭南:「孔雀,其大如大雁而足高,毛皆有斑纹彩,捕而蓄之,拍手即舞。」 孔雀最早见于《山海经》中的《海内经》:「有孔雀。」东汉杨孚著《异物志》记载,岭南:「孔雀,其大如大雁而足高,毛皆有斑纹彩,捕而蓄之,拍手即舞。」

View File

@ -43,3 +43,9 @@ Second part
// Test comments at the end of a line with pre-spacing // Test comments at the end of a line with pre-spacing
First part // First part //
Second part Second part
---
// Test linebreak for Southeast Asian languages (Thai)
ทีวีตรวจทานนอร์ทแฟรีเลคเชอร์โกลด์อัลบัมเชอร์รี่เย้วสโตร์กฤษณ์เคลมเยอบีร่าพ่อค้าบลูเบอร์รี่สหัสวรรษโฮปแคนูโยโย่จูนสตรอว์เบอร์รีซื่อบื้อเยนแบ็กโฮเป็นไงโดนัททอมสเตริโอแคนูวิทย์แดรี่โดนัทวิทย์แอปพริคอทเซอร์ไพรส์ไฮบริดกิฟท์อินเตอร์โซนเซอร์วิสเทียมทานโคโยตี้ม็อบเที่ยงคืนบุญคุณ