Implement sophisticated CJK punctuation adjustment (#954)

This commit is contained in:
Peng Guanwen 2023-05-11 21:02:52 +08:00 committed by GitHub
parent e472b0347f
commit d19a4124de
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
9 changed files with 195 additions and 30 deletions

Binary file not shown.

Binary file not shown.

View File

@ -8,7 +8,8 @@ use crate::layout::AlignElem;
use crate::math::EquationElem;
use crate::prelude::*;
use crate::text::{
shape, LinebreakElem, Quoter, Quotes, ShapedText, SmartQuoteElem, SpaceElem, TextElem,
is_gb_style, shape, LinebreakElem, Quoter, Quotes, ShapedText, SmartQuoteElem,
SpaceElem, TextElem,
};
/// Arrange text, spacing and inline-level elements into a paragraph.
@ -354,6 +355,13 @@ impl<'a> Item<'a> {
}
}
/// Mutable access to the shaped text of this item.
///
/// Yields `Some` only for the `Text` variant; every other variant gives
/// `None`.
fn text_mut(&mut self) -> Option<&mut ShapedText<'a>> {
    if let Self::Text(shaped) = self {
        Some(shaped)
    } else {
        None
    }
}
/// The text length of the item.
fn len(&self) -> usize {
match self {
@ -715,9 +723,12 @@ fn shape_range<'a>(
spans: &SpanMapper,
styles: StyleChain<'a>,
) {
let lang = TextElem::lang_in(styles);
let region = TextElem::region_in(styles);
let mut process = |range: Range, level: BidiLevel| {
let dir = if level.is_ltr() { Dir::LTR } else { Dir::RTL };
let shaped = shape(vt, range.start, &bidi.text[range], spans, styles, dir);
let shaped =
shape(vt, range.start, &bidi.text[range], spans, styles, dir, lang, region);
items.push(Item::Text(shaped));
};
@ -905,15 +916,11 @@ fn linebreak_optimized<'a>(vt: &Vt, p: &'a Preparation<'a>, width: Abs) -> Vec<L
// This often happens with monospace fonts and CJK texts.
ratio = 0.0;
}
if ratio.is_infinite() {
// The line's not stretchable, we calculate the ratio in another way...
ratio = delta / (em / 2.0);
// ...and because it is underfull/overfull, make sure the ratio is at least 1.0.
if ratio > 0.0 {
ratio += 1.0;
} else {
ratio -= 1.0;
}
if ratio > 1.0 {
// We should stretch the line above its stretchability. Now calculate the extra amount.
let extra_stretch = (delta - adjust) / attempt.justifiables() as f64;
// Normalize the amount by half Em size.
ratio = 1.0 + extra_stretch / (em / 2.0);
}
// Determine the cost of the line.
@ -1124,7 +1131,9 @@ fn line<'a>(
let base = expanded.end - shaped.text.len();
let start = range.start.max(base);
let text = &p.bidi.text[start..range.end];
let trimmed = text.trim_end();
// U+200B ZERO WIDTH SPACE is used to provide a line break opportunity,
// we want to trim it too.
let trimmed = text.trim_end().trim_end_matches('\u{200B}');
range.end = start + trimmed.len();
// Deal with hyphens, dashes and justification.
@ -1132,6 +1141,11 @@ fn line<'a>(
dash = hyphen || shy || trimmed.ends_with(['-', '', '—']);
justify |= text.ends_with('\u{2028}');
// Deal with CJK punctuation at line ends.
let gb_style = is_gb_style(shaped.lang, shaped.region);
let end_cjk_punct = trimmed
.ends_with(['”', '', '', '。', '、', '', '', '》', '', '』', '」']);
// Usually, we don't want to shape an empty string because:
// - We don't want the height of trimmed whitespace in a different
// font to be considered for the line height.
@ -1141,12 +1155,21 @@ fn line<'a>(
// need the shaped empty string to make the line the appropriate
// height. That is the case exactly if the string is empty and there
// are no other items in the line.
if hyphen || start + shaped.text.len() > range.end {
if hyphen || start + shaped.text.len() > range.end || end_cjk_punct {
if hyphen || start < range.end || before.is_empty() {
let mut reshaped = shaped.reshape(vt, &p.spans, start..range.end);
if hyphen || shy {
reshaped.push_hyphen(vt);
}
let punct = reshaped.glyphs.last();
if let Some(punct) = punct {
if punct.is_cjk_left_aligned_punctuation(gb_style) {
let shrink_amount = punct.shrinkability().1;
let punct = reshaped.glyphs.to_mut().last_mut().unwrap();
punct.shrink_right(shrink_amount);
reshaped.width -= shrink_amount.at(reshaped.size);
}
}
width += reshaped.width;
last = Some(Item::Text(reshaped));
}
@ -1155,6 +1178,10 @@ fn line<'a>(
}
}
// Deal with CJK punctuation at line starts.
let text = &p.bidi.text[range.start..end];
let start_cjk_punct = text.starts_with(['“', '', '《', '', '『', '「']);
// Reshape the start item if it's split in half.
let mut first = None;
if let Some((Item::Text(shaped), after)) = inner.split_first() {
@ -1163,8 +1190,8 @@ fn line<'a>(
let end = range.end.min(base + shaped.text.len());
// Reshape if necessary.
if range.start + shaped.text.len() > end {
if range.start < end {
if range.start + shaped.text.len() > end || start_cjk_punct {
if range.start < end || start_cjk_punct {
let reshaped = shaped.reshape(vt, &p.spans, range.start..end);
width += reshaped.width;
first = Some(Item::Text(reshaped));
@ -1174,6 +1201,22 @@ fn line<'a>(
}
}
if start_cjk_punct {
let reshaped = first.as_mut().or(last.as_mut()).and_then(Item::text_mut);
if let Some(reshaped) = reshaped {
if let Some(punct) = reshaped.glyphs.first() {
if punct.is_cjk_right_aligned_punctuation() {
let shrink_amount = punct.shrinkability().0;
let punct = reshaped.glyphs.to_mut().first_mut().unwrap();
punct.shrink_left(shrink_amount);
let amount_abs = shrink_amount.at(reshaped.size);
reshaped.width -= amount_abs;
width -= amount_abs;
}
}
}
}
// Measure the inner items.
for item in inner {
width += item.width();

View File

@ -23,6 +23,10 @@ pub struct ShapedText<'a> {
pub text: &'a str,
/// The text direction.
pub dir: Dir,
/// The text language.
pub lang: Lang,
/// The text region.
pub region: Option<Region>,
/// The text's style properties.
pub styles: StyleChain<'a>,
/// The font variant.
@ -48,6 +52,8 @@ pub struct ShapedGlyph {
pub x_offset: Em,
/// The vertical offset of the glyph.
pub y_offset: Em,
/// The adjustability of the glyph.
pub adjustability: Adjustability,
/// The byte range of this glyph's cluster in the full paragraph. A cluster
/// is a sequence of one or multiple glyphs that cannot be separated and
/// must always be treated as a union.
@ -78,41 +84,67 @@ impl ShapedGlyph {
/// Whether the glyph is justifiable.
pub fn is_justifiable(&self) -> bool {
// GB style is not relevant here.
self.is_space()
|| self.is_cjk()
|| self.is_cjk_left_aligned_punctuation()
|| self.is_cjk_script()
|| self.is_cjk_left_aligned_punctuation(true)
|| self.is_cjk_right_aligned_punctuation()
|| self.is_cjk_center_aligned_punctuation(true)
}
pub fn is_cjk(&self) -> bool {
pub fn is_cjk_script(&self) -> bool {
use Script::*;
// U+30FC: Katakana-Hiragana Prolonged Sound Mark
matches!(self.c.script(), Hiragana | Katakana | Han) || self.c == '\u{30FC}'
}
/// Whether the glyph is CJK punctuation of any alignment class
/// (left-aligned, right-aligned or center-aligned).
///
/// The style-dependent checks are queried with `gb_style = true`.
pub fn is_cjk_adjustable(&self) -> bool {
    if self.is_cjk_left_aligned_punctuation(true) {
        return true;
    }
    if self.is_cjk_right_aligned_punctuation() {
        return true;
    }
    self.is_cjk_center_aligned_punctuation(true)
}
/// See <https://www.w3.org/TR/clreq/#punctuation_width_adjustment>
pub fn is_cjk_left_aligned_punctuation(&self) -> bool {
pub fn is_cjk_left_aligned_punctuation(&self, gb_style: bool) -> bool {
// CJK quotation marks share code points with Latin quotation marks,
// but only the CJK ones are full-width (one em wide).
if matches!(self.c, '”' | '') && self.x_advance == Em::one() {
if matches!(self.c, '”' | '')
&& self.x_advance + self.stretchability().1 == Em::one()
{
return true;
}
matches!(self.c, '' | '。' | '、' | '' | '' | '》' | '' | '』' | '」')
if gb_style && matches!(self.c, '' | '。' | '、' | '' | '') {
return true;
}
matches!(self.c, '》' | '' | '』' | '」')
}
/// See <https://www.w3.org/TR/clreq/#punctuation_width_adjustment>
pub fn is_cjk_right_aligned_punctuation(&self) -> bool {
// CJK quotation marks share code points with Latin quotation marks,
// but only the CJK ones are full-width (one em wide).
if matches!(self.c, '“' | '') && self.x_advance == Em::one() {
if matches!(self.c, '“' | '')
&& self.x_advance + self.stretchability().0 == Em::one()
{
return true;
}
matches!(self.c, '《' | '' | '『' | '「')
}
pub fn adjustability(&self) -> Adjustability {
/// See https://www.w3.org/TR/clreq/#punctuation_width_adjustment
pub fn is_cjk_center_aligned_punctuation(&self, gb_style: bool) -> bool {
if !gb_style && matches!(self.c, '' | '。' | '、' | '' | '') {
return true;
}
// U+30FB: Katakana Middle Dot
matches!(self.c, '\u{30FB}')
}
pub fn base_adjustability(&self, gb_style: bool) -> Adjustability {
let width = self.x_advance;
if self.is_space() {
Adjustability {
@ -120,7 +152,7 @@ impl ShapedGlyph {
stretchability: (Em::zero(), width / 2.0),
shrinkability: (Em::zero(), width / 3.0),
}
} else if self.is_cjk_left_aligned_punctuation() {
} else if self.is_cjk_left_aligned_punctuation(gb_style) {
Adjustability {
stretchability: (Em::zero(), Em::zero()),
shrinkability: (Em::zero(), width / 2.0),
@ -130,6 +162,11 @@ impl ShapedGlyph {
stretchability: (Em::zero(), Em::zero()),
shrinkability: (width / 2.0, Em::zero()),
}
} else if self.is_cjk_center_aligned_punctuation(gb_style) {
Adjustability {
stretchability: (Em::zero(), Em::zero()),
shrinkability: (width / 4.0, width / 4.0),
}
} else {
Adjustability::default()
}
@ -137,12 +174,27 @@ impl ShapedGlyph {
/// The stretchability of the character.
pub fn stretchability(&self) -> (Em, Em) {
self.adjustability().stretchability
self.adjustability.stretchability
}
/// The shrinkability of the character.
pub fn shrinkability(&self) -> (Em, Em) {
self.adjustability().shrinkability
self.adjustability.shrinkability
}
/// Narrow the glyph by `amount` on its left side.
///
/// Both the horizontal offset and the advance are reduced by `amount`.
/// The same amount is moved from the left shrinkability to the left
/// stretchability.
pub fn shrink_left(&mut self, amount: Em) {
    let adjust = &mut self.adjustability;
    adjust.shrinkability.0 -= amount;
    adjust.stretchability.0 += amount;

    self.x_advance -= amount;
    self.x_offset -= amount;
}
/// Narrow the glyph by `amount` on its right side.
///
/// Only the advance is reduced; the horizontal offset is left alone. The
/// same amount is moved from the right shrinkability to the right
/// stretchability.
pub fn shrink_right(&mut self, amount: Em) {
    let adjust = &mut self.adjustability;
    adjust.shrinkability.1 -= amount;
    adjust.stretchability.1 += amount;

    self.x_advance -= amount;
}
}
@ -301,7 +353,7 @@ impl<'a> ShapedText<'a> {
pub fn cjk_justifiable_at_last(&self) -> bool {
self.glyphs
.last()
.map(|g| g.is_cjk() || g.is_cjk_left_aligned_punctuation())
.map(|g| g.is_cjk_script() || g.is_cjk_adjustable())
.unwrap_or(false)
}
@ -339,6 +391,8 @@ impl<'a> ShapedText<'a> {
base: text_range.start,
text,
dir: self.dir,
lang: self.lang,
region: self.region,
styles: self.styles,
size: self.size,
variant: self.variant,
@ -346,7 +400,16 @@ impl<'a> ShapedText<'a> {
glyphs: Cow::Borrowed(glyphs),
}
} else {
shape(vt, text_range.start, text, spans, self.styles, self.dir)
shape(
vt,
text_range.start,
text,
spans,
self.styles,
self.dir,
self.lang,
self.region,
)
}
}
@ -373,6 +436,7 @@ impl<'a> ShapedText<'a> {
x_advance,
x_offset: Em::zero(),
y_offset: Em::zero(),
adjustability: Adjustability::default(),
range,
safe_to_break: true,
c: '-',
@ -462,6 +526,7 @@ struct ShapingContext<'a, 'v> {
}
/// Shape text into [`ShapedText`].
#[allow(clippy::too_many_arguments)]
pub fn shape<'a>(
vt: &Vt,
base: usize,
@ -469,6 +534,8 @@ pub fn shape<'a>(
spans: &SpanMapper,
styles: StyleChain<'a>,
dir: Dir,
lang: Lang,
region: Option<Region>,
) -> ShapedText<'a> {
let size = TextElem::size_in(styles);
let mut ctx = ShapingContext {
@ -489,11 +556,14 @@ pub fn shape<'a>(
}
track_and_space(&mut ctx);
calculate_adjustability(&mut ctx, lang, region);
ShapedText {
base,
text,
dir,
lang,
region,
styles,
variant: ctx.variant,
size,
@ -581,6 +651,7 @@ fn shape_segment(
x_advance: font.to_em(pos[i].x_advance),
x_offset: font.to_em(pos[i].x_offset),
y_offset: font.to_em(pos[i].y_offset),
adjustability: Adjustability::default(),
range: start..end,
safe_to_break: !info.unsafe_to_break(),
c: text[cluster..].chars().next().unwrap(),
@ -645,6 +716,7 @@ fn shape_tofus(ctx: &mut ShapingContext, base: usize, text: &str, font: Font) {
x_advance,
x_offset: Em::zero(),
y_offset: Em::zero(),
adjustability: Adjustability::default(),
range: start..end,
safe_to_break: true,
c,
@ -679,6 +751,43 @@ fn track_and_space(ctx: &mut ShapingContext) {
}
}
/// Whether text in the given language and region uses GB-style CJK
/// punctuation.
///
/// Returns `false` only for `Lang::CHINESE` combined with region `"TW"` or
/// `"HK"`; every other combination, including a missing region, is `true`.
pub fn is_gb_style(lang: Lang, region: Option<Region>) -> bool {
    // GB style is the default: most CJK variants (zh-CN, ja-JP, zh-SG, zh-MY,
    // ...) use it, whereas zh-HK and zh-TW use the alternative style.
    // NOTE(review): the comparison is against upper-case codes; presumably
    // `Region::as_str` yields upper-case — confirm.
    let region_code = region.as_ref().map(Region::as_str);
    lang != Lang::CHINESE || !matches!(region_code, Some("TW" | "HK"))
}
/// Calculate stretchability and shrinkability of each glyph,
/// and CJK punctuation adjustments according to Chinese Layout Requirements.
///
/// Works in two passes over `ctx.glyphs`:
///
/// 1. Every glyph gets its base adjustability for the punctuation style
///    selected by `lang` and `region`.
/// 2. In GB style, each pair of adjacent CJK punctuation glyphs is tightened
///    by half of the first glyph's advance, provided the facing sides of the
///    two glyphs offer at least that much shrinkability together.
///
/// The first pass must finish before the second starts: the second pass reads
/// the shrinkability of the *following* glyph, which would otherwise still
/// hold the value it was constructed with, and assigning the base value
/// afterwards would discard whatever `shrink_left` recorded on it.
fn calculate_adjustability(ctx: &mut ShapingContext, lang: Lang, region: Option<Region>) {
    let gb_style = is_gb_style(lang, region);

    for glyph in ctx.glyphs.iter_mut() {
        glyph.adjustability = glyph.base_adjustability(gb_style);
    }

    // Only GB style needs further adjustment.
    if !gb_style {
        return;
    }

    // Now we apply consecutive punctuation adjustment, specified in Chinese Layout
    // Requirements, section 3.1.6.1 Punctuation Adjustment Space, and Japanese Layout
    // Requirements, section 3.1 Line Composition Rules for Punctuation Marks
    let mut glyphs = ctx.glyphs.iter_mut().peekable();
    while let Some(glyph) = glyphs.next() {
        // Both glyphs of the pair must be CJK punctuation; otherwise e.g. a
        // space in front of an opening bracket would be squeezed as well.
        if !glyph.is_cjk_adjustable() {
            continue;
        }
        let Some(next) = glyphs.peek_mut() else { continue };
        if !next.is_cjk_adjustable() {
            continue;
        }

        let delta = glyph.x_advance / 2.0;
        if glyph.shrinkability().1 + next.shrinkability().0 >= delta {
            // Take as much as possible from the right side of the first
            // glyph and the remainder from the left side of the second.
            let left_delta = glyph.shrinkability().1.min(delta);
            glyph.shrink_right(left_delta);
            next.shrink_left(delta - left_delta);
        }
    }
}
/// Difference between non-breaking and normal space.
fn nbsp_delta(font: &Font) -> Option<Em> {
let space = font.ttf().glyph_index(' ')?.0;

Binary file not shown.

Before

Width:  |  Height:  |  Size: 118 KiB

After

Width:  |  Height:  |  Size: 68 KiB

Binary file not shown.

Before

Width:  |  Height:  |  Size: 36 KiB

After

Width:  |  Height:  |  Size: 35 KiB

Binary file not shown.

Before

Width:  |  Height:  |  Size: 21 KiB

After

Width:  |  Height:  |  Size: 20 KiB

View File

@ -5,7 +5,7 @@
// Most Chinese publications do not use hanging punctuation at line end.
#set page(width: auto)
#set par(justify: true)
#set text(overhang: false, lang: "zh")
#set text(font: "Noto Serif CJK SC", lang: "zh", overhang: false)
#rect(inset: 0pt, width: 80pt, fill: rgb("eee"))[
中文维基百科使用汉字书写汉字是汉族或华人的共同文字是中国大陆、新加坡、马来西亚、台湾、香港、澳门的唯一官方文字或官方文字之一。25.9%而美国和荷兰则分別占13.7%及8.2%。近年來,中国大陆地区的维基百科编辑者正在迅速增加;
@ -33,4 +33,17 @@
《书名》《测试》下一行
《书名》《测试》。
]
]
---
// Test Variants of Mainland China, Hong Kong, and Japan.
// 17 characters a line.
#set page(width: 170pt + 10pt, margin: (x: 5pt))
#set text(font: "Noto Serif CJK SC", lang: "zh", overhang: false)
#set par(justify: true)
孔雀最早见于《山海经》中的《海内经》:\u{200b}“有孔雀。”东汉杨孚著《异物志》记载,岭南:“孔雀,其大如大雁而足高,毛皆有斑纹彩,捕而蓄之,拍手即舞。”
#set text(font: "Noto Serif CJK TC", lang: "zh", region: "hk")
孔雀最早见于《山海经》中的《海内经》:「有孔雀。」东汉杨孚著《异物志》记载,岭南:「孔雀,其大如大雁而足高,毛皆有斑纹彩,捕而蓄之,拍手即舞。」