Refine linebreak algorithm for better Chinese justification (#701)

This commit is contained in:
Peng Guanwen 2023-04-13 16:44:39 +08:00 committed by GitHub
parent 03d2ec9f81
commit ff16f3fb37
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
8 changed files with 189 additions and 65 deletions

View File

@ -457,22 +457,35 @@ impl<'a> Line<'a> {
self.items().skip(start).take(end - start)
}
/// How many justifiable glyphs the line contains.
/// How many glyphs are in the text where we can insert additional
/// space when encountering underfull lines.
fn justifiables(&self) -> usize {
let mut count = 0;
for shaped in self.items().filter_map(Item::text) {
count += shaped.justifiables();
}
// CJK character at line end should not be adjusted.
if self
.items()
.last()
.and_then(Item::text)
.map(|s| s.cjk_justifiable_at_last())
.unwrap_or(false)
{
count -= 1;
}
count
}
/// How much of the line is stretchable spaces.
fn stretch(&self) -> Abs {
let mut stretch = Abs::zero();
for shaped in self.items().filter_map(Item::text) {
stretch += shaped.stretch();
}
stretch
/// How much can the line stretch
fn stretchability(&self) -> Abs {
self.items().filter_map(Item::text).map(|s| s.stretchability()).sum()
}
/// How much can the line shrink
fn shrinkability(&self) -> Abs {
self.items().filter_map(Item::text).map(|s| s.shrinkability()).sum()
}
/// The sum of fractions in the line.
@ -835,10 +848,9 @@ fn linebreak_optimized<'a>(vt: &Vt, p: &'a Preparation<'a>, width: Abs) -> Vec<L
// Cost parameters.
const HYPH_COST: Cost = 0.5;
const CONSECUTIVE_DASH_COST: Cost = 30.0;
const CONSECUTIVE_DASH_COST: Cost = 300.0;
const MAX_COST: Cost = 1_000_000.0;
const MIN_COST: Cost = -MAX_COST;
const MIN_RATIO: f64 = -0.15;
const MIN_RATIO: f64 = -1.0;
// Dynamic programming table.
let mut active = 0;
@ -864,13 +876,30 @@ fn linebreak_optimized<'a>(vt: &Vt, p: &'a Preparation<'a>, width: Abs) -> Vec<L
// Determine how much the line's spaces would need to be stretched
// to make it the desired width.
let delta = width - attempt.width;
let mut ratio = delta / attempt.stretch();
if ratio.is_infinite() {
ratio = delta / (em / 2.0);
// Determine how much stretch are permitted.
let adjust = if delta >= Abs::zero() {
attempt.stretchability()
} else {
attempt.shrinkability()
};
// Ideally, the ratio should between -1.0 and 1.0, but sometimes a value above 1.0
// is possible, in which case the line is underfull.
let mut ratio = delta / adjust;
if ratio.is_nan() {
// The line is not stretchable, but it just fits.
// This often happens with monospace fonts and CJK texts.
ratio = 0.0;
}
if ratio.is_infinite() {
// The line's not stretchable, we calculate the ratio in another way...
ratio = delta / (em / 2.0);
// ...and because it is underfull/overfull, make sure the ratio is at least 1.0.
if ratio > 0.0 {
ratio += 1.0;
} else {
ratio -= 1.0;
}
}
// At some point, it doesn't matter any more.
ratio = ratio.min(10.0);
// Determine the cost of the line.
let min_ratio = if attempt.justify { MIN_RATIO } else { 0.0 };
@ -883,11 +912,15 @@ fn linebreak_optimized<'a>(vt: &Vt, p: &'a Preparation<'a>, width: Abs) -> Vec<L
active = i + 1;
MAX_COST
} else if mandatory || eof {
// This is a mandatory break and the line is not overfull, so it
// has minimum cost. All breakpoints before this one become
// inactive since no line can span above the mandatory break.
// This is a mandatory break and the line is not overfull, so
// all breakpoints before this one become inactive since no line
// can span above the mandatory break.
active = k;
MIN_COST + if attempt.justify { ratio.powi(3).abs() } else { 0.0 }
if attempt.justify {
ratio.powi(3).abs()
} else {
0.0
}
} else {
// Normal line with cost of |ratio^3|.
ratio.powi(3).abs()
@ -898,6 +931,12 @@ fn linebreak_optimized<'a>(vt: &Vt, p: &'a Preparation<'a>, width: Abs) -> Vec<L
cost += HYPH_COST;
}
// In Knuth paper, cost = (1 + 100|r|^3 + p)^2 + a,
// where r is the ratio, p=50 is penaty, and a=3000 is consecutive penaty.
// We divide the whole formula by 10, resulting (0.01 + |r|^3 + p)^2 + a,
// where p=0.5 and a=300
cost = (0.01 + cost).powi(2);
// Penalize two consecutive dashes (not necessarily hyphens) extra.
if attempt.dash && pred.line.dash {
cost += CONSECUTIVE_DASH_COST;
@ -1233,13 +1272,32 @@ fn commit(
}
}
// Determine how much to justify each space.
// Determine how much addtional space is needed.
// The justicication_ratio is for the first step justification,
// extra_justification is for the last step.
// For more info on multi-step justification, see Procedures for Inter-
// Character Space Expansion in W3C document Chinese Layout Requirements.
let fr = line.fr();
let mut justification = Abs::zero();
if remaining < Abs::zero() || (line.justify && fr.is_zero()) {
let mut justification_ratio = 0.0;
let mut extra_justification = Abs::zero();
let shrink = line.shrinkability();
let stretch = line.stretchability();
if remaining < Abs::zero() && shrink > Abs::zero() {
// Attempt to reduce the length of the line, using shrinkability.
justification_ratio = (remaining / shrink).max(-1.0);
remaining = (remaining + shrink).min(Abs::zero());
} else if line.justify && fr.is_zero() {
// Attempt to increase the length of the line, using stretchability.
if stretch > Abs::zero() {
justification_ratio = (remaining / stretch).min(1.0);
remaining = (remaining - stretch).max(Abs::zero());
}
let justifiables = line.justifiables();
if justifiables > 0 {
justification = remaining / justifiables as f64;
if justifiables > 0 && remaining > Abs::zero() {
// Underfull line, distribute the extra space.
extra_justification = remaining / justifiables as f64;
remaining = Abs::zero();
}
}
@ -1275,7 +1333,7 @@ fn commit(
}
}
Item::Text(shaped) => {
let frame = shaped.build(vt, justification);
let frame = shaped.build(vt, justification_ratio, extra_justification);
push(&mut offset, frame);
}
Item::Frame(frame) => {

View File

@ -70,22 +70,42 @@ impl ShapedGlyph {
}
/// Whether the glyph is justifiable.
///
/// Typst's basic justification strategy is to stretch all the spaces
/// in a line until the line fills the available width. However, some
/// scripts (notably Chinese and Japanese) don't use spaces.
///
/// In Japanese typography, the convention is to insert space evenly
/// between all glyphs. I assume it's the same in Chinese.
pub fn is_justifiable(&self) -> bool {
self.is_space() || is_spaceless(self.c.script())
self.is_space() || self.is_cjk() || self.is_cjk_punctuation()
}
}
/// Does this script separate its words using spaces?
fn is_spaceless(script: Script) -> bool {
use Script::*;
matches!(script, Hiragana | Katakana | Han)
pub fn is_cjk(&self) -> bool {
use Script::*;
matches!(self.c.script(), Hiragana | Katakana | Han)
}
pub fn is_cjk_punctuation(&self) -> bool {
matches!(self.c, '' | '。' | '、' | '' | '')
}
/// The stretchability of the character.
pub fn stretchability(&self) -> Em {
let width = self.x_advance;
if self.is_space() {
// The number for spaces is from Knuth-Plass' paper
width / 2.0
} else {
Em::zero()
}
}
/// The shrinkability of the character.
pub fn shrinkability(&self) -> Em {
let width = self.x_advance;
if self.is_space() {
// The number for spaces is from Knuth-Plass' paper
width / 3.0
} else if self.is_cjk_punctuation() {
width / 2.0
} else {
Em::zero()
}
}
}
/// A side you can go toward.
@ -101,7 +121,12 @@ impl<'a> ShapedText<'a> {
///
/// The `justification` defines how much extra advance width each
/// [justifiable glyph](ShapedGlyph::is_justifiable) will get.
pub fn build(&self, vt: &Vt, justification: Abs) -> Frame {
pub fn build(
&self,
vt: &Vt,
justification_ratio: f64,
extra_justification: Abs,
) -> Frame {
let (top, bottom) = self.measure(vt);
let size = Size::new(self.width, top + bottom);
@ -120,19 +145,25 @@ impl<'a> ShapedText<'a> {
let pos = Point::new(offset, top + shift - y_offset.at(self.size));
let glyphs = group
.iter()
.map(|glyph| Glyph {
id: glyph.glyph_id,
x_advance: glyph.x_advance
+ if glyph.is_justifiable() {
frame.size_mut().x += justification;
Em::from_length(justification, self.size)
} else {
Em::zero()
},
x_offset: glyph.x_offset,
c: glyph.c,
span: glyph.span,
offset: glyph.offset,
.map(|glyph| {
let mut justification = Em::zero();
if justification_ratio < 0.0 {
justification += glyph.shrinkability() * justification_ratio
} else {
justification += glyph.stretchability() * justification_ratio
}
if glyph.is_justifiable() {
justification += Em::from_length(extra_justification, self.size)
}
frame.size_mut().x += justification.at(self.size);
Glyph {
id: glyph.glyph_id,
x_advance: glyph.x_advance + justification,
x_offset: glyph.x_offset,
c: glyph.c,
span: glyph.span,
offset: glyph.offset,
}
})
.collect();
@ -200,17 +231,35 @@ impl<'a> ShapedText<'a> {
(top, bottom)
}
/// How many justifiable glyphs the text contains.
/// How many glyphs are in the text where we can insert additional
/// space when encountering underfull lines.
pub fn justifiables(&self) -> usize {
self.glyphs.iter().filter(|g| g.is_justifiable()).count()
}
/// The width of the spaces in the text.
pub fn stretch(&self) -> Abs {
/// Whether the last glyph is a CJK character which should not be justified
/// on line end.
pub fn cjk_justifiable_at_last(&self) -> bool {
self.glyphs
.last()
.map(|g| g.is_cjk() || g.is_cjk_punctuation())
.unwrap_or(false)
}
/// The stretchability of the text.
pub fn stretchability(&self) -> Abs {
self.glyphs
.iter()
.filter(|g| g.is_justifiable())
.map(|g| g.x_advance)
.map(|g| g.stretchability())
.sum::<Em>()
.at(self.size)
}
/// The shrinkability of the text
pub fn shrinkability(&self) -> Abs {
self.glyphs
.iter()
.map(|g| g.shrinkability())
.sum::<Em>()
.at(self.size)
}

Binary file not shown.

After

Width:  |  Height:  |  Size: 43 KiB

Binary file not shown.

Before

Width:  |  Height:  |  Size: 29 KiB

After

Width:  |  Height:  |  Size: 12 KiB

Binary file not shown.

Before

Width:  |  Height:  |  Size: 68 KiB

After

Width:  |  Height:  |  Size: 67 KiB

Binary file not shown.

Before

Width:  |  Height:  |  Size: 19 KiB

After

Width:  |  Height:  |  Size: 18 KiB

View File

@ -0,0 +1,23 @@
// Test Chinese text in narrow lines.
// In Chinese typography, line length should be multiples of the character size
// and the line ends should be aligned with each other.
// Most Chinese publications do not use hanging punctuation at line end.
#set page(width: auto)
#set par(justify: true)
#set text(overhang: false, lang: "zh")
#rect(inset: 0pt, width: 80pt, fill: rgb("eee"))[
中文维基百科使用汉字书写汉字是汉族或华人的共同文字是中国大陆、新加坡、马来西亚、台湾、香港、澳门的唯一官方文字或官方文字之一。25.9%而美国和荷兰则分別占13.7%及8.2%。近年來,中国大陆地区的维基百科编辑者正在迅速增加;
]
---
// Japanese typography is more complex, make sure it is at least a bit sensible.
#set page(width: auto)
#set par(justify: true)
#set text(lang: "jp")
#rect(inset: 0pt, width: 80pt, fill: rgb("eee"))[
ウィキペディア(英: Wikipediaは、世界中のボランティアの共同作業によって執筆及び作成されるフリーの多言語インターネット百科事典である。主に寄付に依って活動している非営利団体「ウィキメディア財団」が所有・運営している。
専門家によるオンライン百科事典プロジェクトNupediaヌーペディアを前身として、2001年1月、ラリー・サンガーとジミー・ウェールズ英: Jimmy Donal "Jimbo" Walesにより英語でプロジェクトが開始された。
]

View File

@ -20,12 +20,6 @@ D
A B C #linebreak(justify: true)
D E F #linebreak(justify: true)
---
// Test that justificating chinese text is at least a bit sensible.
#set page(width: 200pt)
#set par(justify: true)
中文维基百科使用汉字书写汉字是汉族或华人的共同文字是中国大陆、新加坡、马来西亚、台湾、香港、澳门的唯一官方文字或官方文字之一。25.9%而美国和荷兰则分別占13.7%及8.2%。近年來,中国大陆地区的维基百科编辑者正在迅速增加;
---
// Test that there are no hick-ups with justification enabled and
// basically empty paragraph.