Implement sophisticated CJK punctuation adjustment (#954)
This commit is contained in:
parent
e472b0347f
commit
d19a4124de
BIN
assets/fonts/NotoSerifCJKjp-Regular.otf
Normal file
BIN
assets/fonts/NotoSerifCJKjp-Regular.otf
Normal file
Binary file not shown.
Binary file not shown.
BIN
assets/fonts/NotoSerifCJKtc-Regular.otf
Normal file
BIN
assets/fonts/NotoSerifCJKtc-Regular.otf
Normal file
Binary file not shown.
@ -8,7 +8,8 @@ use crate::layout::AlignElem;
|
||||
use crate::math::EquationElem;
|
||||
use crate::prelude::*;
|
||||
use crate::text::{
|
||||
shape, LinebreakElem, Quoter, Quotes, ShapedText, SmartQuoteElem, SpaceElem, TextElem,
|
||||
is_gb_style, shape, LinebreakElem, Quoter, Quotes, ShapedText, SmartQuoteElem,
|
||||
SpaceElem, TextElem,
|
||||
};
|
||||
|
||||
/// Arrange text, spacing and inline-level elements into a paragraph.
|
||||
@ -354,6 +355,13 @@ impl<'a> Item<'a> {
|
||||
}
|
||||
}
|
||||
|
||||
fn text_mut(&mut self) -> Option<&mut ShapedText<'a>> {
|
||||
match self {
|
||||
Self::Text(shaped) => Some(shaped),
|
||||
_ => None,
|
||||
}
|
||||
}
|
||||
|
||||
/// The text length of the item.
|
||||
fn len(&self) -> usize {
|
||||
match self {
|
||||
@ -715,9 +723,12 @@ fn shape_range<'a>(
|
||||
spans: &SpanMapper,
|
||||
styles: StyleChain<'a>,
|
||||
) {
|
||||
let lang = TextElem::lang_in(styles);
|
||||
let region = TextElem::region_in(styles);
|
||||
let mut process = |range: Range, level: BidiLevel| {
|
||||
let dir = if level.is_ltr() { Dir::LTR } else { Dir::RTL };
|
||||
let shaped = shape(vt, range.start, &bidi.text[range], spans, styles, dir);
|
||||
let shaped =
|
||||
shape(vt, range.start, &bidi.text[range], spans, styles, dir, lang, region);
|
||||
items.push(Item::Text(shaped));
|
||||
};
|
||||
|
||||
@ -905,15 +916,11 @@ fn linebreak_optimized<'a>(vt: &Vt, p: &'a Preparation<'a>, width: Abs) -> Vec<L
|
||||
// This often happens with monospace fonts and CJK texts.
|
||||
ratio = 0.0;
|
||||
}
|
||||
if ratio.is_infinite() {
|
||||
// The line's not stretchable, we calculate the ratio in another way...
|
||||
ratio = delta / (em / 2.0);
|
||||
// ...and because it is underfull/overfull, make sure the ratio is at least 1.0.
|
||||
if ratio > 0.0 {
|
||||
ratio += 1.0;
|
||||
} else {
|
||||
ratio -= 1.0;
|
||||
}
|
||||
if ratio > 1.0 {
|
||||
// We should stretch the line above its stretchability. Now calculate the extra amount.
|
||||
let extra_stretch = (delta - adjust) / attempt.justifiables() as f64;
|
||||
// Normalize the amount by half Em size.
|
||||
ratio = 1.0 + extra_stretch / (em / 2.0);
|
||||
}
|
||||
|
||||
// Determine the cost of the line.
|
||||
@ -1124,7 +1131,9 @@ fn line<'a>(
|
||||
let base = expanded.end - shaped.text.len();
|
||||
let start = range.start.max(base);
|
||||
let text = &p.bidi.text[start..range.end];
|
||||
let trimmed = text.trim_end();
|
||||
// U+200B ZERO WIDTH SPACE is used to provide a line break opportunity,
|
||||
// we want to trim it too.
|
||||
let trimmed = text.trim_end().trim_end_matches('\u{200B}');
|
||||
range.end = start + trimmed.len();
|
||||
|
||||
// Deal with hyphens, dashes and justification.
|
||||
@ -1132,6 +1141,11 @@ fn line<'a>(
|
||||
dash = hyphen || shy || trimmed.ends_with(['-', '–', '—']);
|
||||
justify |= text.ends_with('\u{2028}');
|
||||
|
||||
// Deal with CJK punctuation at line ends.
|
||||
let gb_style = is_gb_style(shaped.lang, shaped.region);
|
||||
let end_cjk_punct = trimmed
|
||||
.ends_with(['”', '’', ',', '。', '、', ':', ';', '》', ')', '』', '」']);
|
||||
|
||||
// Usually, we don't want to shape an empty string because:
|
||||
// - We don't want the height of trimmed whitespace in a different
|
||||
// font to be considered for the line height.
|
||||
@ -1141,12 +1155,21 @@ fn line<'a>(
|
||||
// need the shaped empty string to make the line the appropriate
|
||||
// height. That is the case exactly if the string is empty and there
|
||||
// are no other items in the line.
|
||||
if hyphen || start + shaped.text.len() > range.end {
|
||||
if hyphen || start + shaped.text.len() > range.end || end_cjk_punct {
|
||||
if hyphen || start < range.end || before.is_empty() {
|
||||
let mut reshaped = shaped.reshape(vt, &p.spans, start..range.end);
|
||||
if hyphen || shy {
|
||||
reshaped.push_hyphen(vt);
|
||||
}
|
||||
let punct = reshaped.glyphs.last();
|
||||
if let Some(punct) = punct {
|
||||
if punct.is_cjk_left_aligned_punctuation(gb_style) {
|
||||
let shrink_amount = punct.shrinkability().1;
|
||||
let punct = reshaped.glyphs.to_mut().last_mut().unwrap();
|
||||
punct.shrink_right(shrink_amount);
|
||||
reshaped.width -= shrink_amount.at(reshaped.size);
|
||||
}
|
||||
}
|
||||
width += reshaped.width;
|
||||
last = Some(Item::Text(reshaped));
|
||||
}
|
||||
@ -1155,6 +1178,10 @@ fn line<'a>(
|
||||
}
|
||||
}
|
||||
|
||||
// Deal with CJK punctuation at line starts.
|
||||
let text = &p.bidi.text[range.start..end];
|
||||
let start_cjk_punct = text.starts_with(['“', '‘', '《', '(', '『', '「']);
|
||||
|
||||
// Reshape the start item if it's split in half.
|
||||
let mut first = None;
|
||||
if let Some((Item::Text(shaped), after)) = inner.split_first() {
|
||||
@ -1163,8 +1190,8 @@ fn line<'a>(
|
||||
let end = range.end.min(base + shaped.text.len());
|
||||
|
||||
// Reshape if necessary.
|
||||
if range.start + shaped.text.len() > end {
|
||||
if range.start < end {
|
||||
if range.start + shaped.text.len() > end || start_cjk_punct {
|
||||
if range.start < end || start_cjk_punct {
|
||||
let reshaped = shaped.reshape(vt, &p.spans, range.start..end);
|
||||
width += reshaped.width;
|
||||
first = Some(Item::Text(reshaped));
|
||||
@ -1174,6 +1201,22 @@ fn line<'a>(
|
||||
}
|
||||
}
|
||||
|
||||
if start_cjk_punct {
|
||||
let reshaped = first.as_mut().or(last.as_mut()).and_then(Item::text_mut);
|
||||
if let Some(reshaped) = reshaped {
|
||||
if let Some(punct) = reshaped.glyphs.first() {
|
||||
if punct.is_cjk_right_aligned_punctuation() {
|
||||
let shrink_amount = punct.shrinkability().0;
|
||||
let punct = reshaped.glyphs.to_mut().first_mut().unwrap();
|
||||
punct.shrink_left(shrink_amount);
|
||||
let amount_abs = shrink_amount.at(reshaped.size);
|
||||
reshaped.width -= amount_abs;
|
||||
width -= amount_abs;
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// Measure the inner items.
|
||||
for item in inner {
|
||||
width += item.width();
|
||||
|
@ -23,6 +23,10 @@ pub struct ShapedText<'a> {
|
||||
pub text: &'a str,
|
||||
/// The text direction.
|
||||
pub dir: Dir,
|
||||
/// The text language.
|
||||
pub lang: Lang,
|
||||
/// The text region.
|
||||
pub region: Option<Region>,
|
||||
/// The text's style properties.
|
||||
pub styles: StyleChain<'a>,
|
||||
/// The font variant.
|
||||
@ -48,6 +52,8 @@ pub struct ShapedGlyph {
|
||||
pub x_offset: Em,
|
||||
/// The vertical offset of the glyph.
|
||||
pub y_offset: Em,
|
||||
/// The adjustability of the glyph.
|
||||
pub adjustability: Adjustability,
|
||||
/// The byte range of this glyph's cluster in the full paragraph. A cluster
|
||||
/// is a sequence of one or multiple glyphs that cannot be separated and
|
||||
/// must always be treated as a union.
|
||||
@ -78,41 +84,67 @@ impl ShapedGlyph {
|
||||
|
||||
/// Whether the glyph is justifiable.
|
||||
pub fn is_justifiable(&self) -> bool {
|
||||
// GB style is not relevant here.
|
||||
self.is_space()
|
||||
|| self.is_cjk()
|
||||
|| self.is_cjk_left_aligned_punctuation()
|
||||
|| self.is_cjk_script()
|
||||
|| self.is_cjk_left_aligned_punctuation(true)
|
||||
|| self.is_cjk_right_aligned_punctuation()
|
||||
|| self.is_cjk_center_aligned_punctuation(true)
|
||||
}
|
||||
|
||||
pub fn is_cjk(&self) -> bool {
|
||||
pub fn is_cjk_script(&self) -> bool {
|
||||
use Script::*;
|
||||
// U+30FC: Katakana-Hiragana Prolonged Sound Mark
|
||||
matches!(self.c.script(), Hiragana | Katakana | Han) || self.c == '\u{30FC}'
|
||||
}
|
||||
|
||||
pub fn is_cjk_adjustable(&self) -> bool {
|
||||
self.is_cjk_left_aligned_punctuation(true)
|
||||
|| self.is_cjk_right_aligned_punctuation()
|
||||
|| self.is_cjk_center_aligned_punctuation(true)
|
||||
}
|
||||
|
||||
/// See <https://www.w3.org/TR/clreq/#punctuation_width_adjustment>
|
||||
pub fn is_cjk_left_aligned_punctuation(&self) -> bool {
|
||||
pub fn is_cjk_left_aligned_punctuation(&self, gb_style: bool) -> bool {
|
||||
// CJK quotation marks shares codepoints with latin quotation marks.
|
||||
// But only the CJK ones have full width.
|
||||
if matches!(self.c, '”' | '’') && self.x_advance == Em::one() {
|
||||
if matches!(self.c, '”' | '’')
|
||||
&& self.x_advance + self.stretchability().1 == Em::one()
|
||||
{
|
||||
return true;
|
||||
}
|
||||
|
||||
matches!(self.c, ',' | '。' | '、' | ':' | ';' | '》' | ')' | '』' | '」')
|
||||
if gb_style && matches!(self.c, ',' | '。' | '、' | ':' | ';') {
|
||||
return true;
|
||||
}
|
||||
|
||||
matches!(self.c, '》' | ')' | '』' | '」')
|
||||
}
|
||||
|
||||
/// See <https://www.w3.org/TR/clreq/#punctuation_width_adjustment>
|
||||
pub fn is_cjk_right_aligned_punctuation(&self) -> bool {
|
||||
// CJK quotation marks shares codepoints with latin quotation marks.
|
||||
// But only the CJK ones have full width.
|
||||
if matches!(self.c, '“' | '‘') && self.x_advance == Em::one() {
|
||||
if matches!(self.c, '“' | '‘')
|
||||
&& self.x_advance + self.stretchability().0 == Em::one()
|
||||
{
|
||||
return true;
|
||||
}
|
||||
|
||||
matches!(self.c, '《' | '(' | '『' | '「')
|
||||
}
|
||||
|
||||
pub fn adjustability(&self) -> Adjustability {
|
||||
/// See https://www.w3.org/TR/clreq/#punctuation_width_adjustment
|
||||
pub fn is_cjk_center_aligned_punctuation(&self, gb_style: bool) -> bool {
|
||||
if !gb_style && matches!(self.c, ',' | '。' | '、' | ':' | ';') {
|
||||
return true;
|
||||
}
|
||||
|
||||
// U+30FB: Katakana Middle Dot
|
||||
matches!(self.c, '\u{30FB}')
|
||||
}
|
||||
|
||||
pub fn base_adjustability(&self, gb_style: bool) -> Adjustability {
|
||||
let width = self.x_advance;
|
||||
if self.is_space() {
|
||||
Adjustability {
|
||||
@ -120,7 +152,7 @@ impl ShapedGlyph {
|
||||
stretchability: (Em::zero(), width / 2.0),
|
||||
shrinkability: (Em::zero(), width / 3.0),
|
||||
}
|
||||
} else if self.is_cjk_left_aligned_punctuation() {
|
||||
} else if self.is_cjk_left_aligned_punctuation(gb_style) {
|
||||
Adjustability {
|
||||
stretchability: (Em::zero(), Em::zero()),
|
||||
shrinkability: (Em::zero(), width / 2.0),
|
||||
@ -130,6 +162,11 @@ impl ShapedGlyph {
|
||||
stretchability: (Em::zero(), Em::zero()),
|
||||
shrinkability: (width / 2.0, Em::zero()),
|
||||
}
|
||||
} else if self.is_cjk_center_aligned_punctuation(gb_style) {
|
||||
Adjustability {
|
||||
stretchability: (Em::zero(), Em::zero()),
|
||||
shrinkability: (width / 4.0, width / 4.0),
|
||||
}
|
||||
} else {
|
||||
Adjustability::default()
|
||||
}
|
||||
@ -137,12 +174,27 @@ impl ShapedGlyph {
|
||||
|
||||
/// The stretchability of the character.
|
||||
pub fn stretchability(&self) -> (Em, Em) {
|
||||
self.adjustability().stretchability
|
||||
self.adjustability.stretchability
|
||||
}
|
||||
|
||||
/// The shrinkability of the character.
|
||||
pub fn shrinkability(&self) -> (Em, Em) {
|
||||
self.adjustability().shrinkability
|
||||
self.adjustability.shrinkability
|
||||
}
|
||||
|
||||
/// Shrink the width of glyph on the left side.
|
||||
pub fn shrink_left(&mut self, amount: Em) {
|
||||
self.x_offset -= amount;
|
||||
self.x_advance -= amount;
|
||||
self.adjustability.shrinkability.0 -= amount;
|
||||
self.adjustability.stretchability.0 += amount;
|
||||
}
|
||||
|
||||
/// Shrink the width of glyph on the right side.
|
||||
pub fn shrink_right(&mut self, amount: Em) {
|
||||
self.x_advance -= amount;
|
||||
self.adjustability.shrinkability.1 -= amount;
|
||||
self.adjustability.stretchability.1 += amount;
|
||||
}
|
||||
}
|
||||
|
||||
@ -301,7 +353,7 @@ impl<'a> ShapedText<'a> {
|
||||
pub fn cjk_justifiable_at_last(&self) -> bool {
|
||||
self.glyphs
|
||||
.last()
|
||||
.map(|g| g.is_cjk() || g.is_cjk_left_aligned_punctuation())
|
||||
.map(|g| g.is_cjk_script() || g.is_cjk_adjustable())
|
||||
.unwrap_or(false)
|
||||
}
|
||||
|
||||
@ -339,6 +391,8 @@ impl<'a> ShapedText<'a> {
|
||||
base: text_range.start,
|
||||
text,
|
||||
dir: self.dir,
|
||||
lang: self.lang,
|
||||
region: self.region,
|
||||
styles: self.styles,
|
||||
size: self.size,
|
||||
variant: self.variant,
|
||||
@ -346,7 +400,16 @@ impl<'a> ShapedText<'a> {
|
||||
glyphs: Cow::Borrowed(glyphs),
|
||||
}
|
||||
} else {
|
||||
shape(vt, text_range.start, text, spans, self.styles, self.dir)
|
||||
shape(
|
||||
vt,
|
||||
text_range.start,
|
||||
text,
|
||||
spans,
|
||||
self.styles,
|
||||
self.dir,
|
||||
self.lang,
|
||||
self.region,
|
||||
)
|
||||
}
|
||||
}
|
||||
|
||||
@ -373,6 +436,7 @@ impl<'a> ShapedText<'a> {
|
||||
x_advance,
|
||||
x_offset: Em::zero(),
|
||||
y_offset: Em::zero(),
|
||||
adjustability: Adjustability::default(),
|
||||
range,
|
||||
safe_to_break: true,
|
||||
c: '-',
|
||||
@ -462,6 +526,7 @@ struct ShapingContext<'a, 'v> {
|
||||
}
|
||||
|
||||
/// Shape text into [`ShapedText`].
|
||||
#[allow(clippy::too_many_arguments)]
|
||||
pub fn shape<'a>(
|
||||
vt: &Vt,
|
||||
base: usize,
|
||||
@ -469,6 +534,8 @@ pub fn shape<'a>(
|
||||
spans: &SpanMapper,
|
||||
styles: StyleChain<'a>,
|
||||
dir: Dir,
|
||||
lang: Lang,
|
||||
region: Option<Region>,
|
||||
) -> ShapedText<'a> {
|
||||
let size = TextElem::size_in(styles);
|
||||
let mut ctx = ShapingContext {
|
||||
@ -489,11 +556,14 @@ pub fn shape<'a>(
|
||||
}
|
||||
|
||||
track_and_space(&mut ctx);
|
||||
calculate_adjustability(&mut ctx, lang, region);
|
||||
|
||||
ShapedText {
|
||||
base,
|
||||
text,
|
||||
dir,
|
||||
lang,
|
||||
region,
|
||||
styles,
|
||||
variant: ctx.variant,
|
||||
size,
|
||||
@ -581,6 +651,7 @@ fn shape_segment(
|
||||
x_advance: font.to_em(pos[i].x_advance),
|
||||
x_offset: font.to_em(pos[i].x_offset),
|
||||
y_offset: font.to_em(pos[i].y_offset),
|
||||
adjustability: Adjustability::default(),
|
||||
range: start..end,
|
||||
safe_to_break: !info.unsafe_to_break(),
|
||||
c: text[cluster..].chars().next().unwrap(),
|
||||
@ -645,6 +716,7 @@ fn shape_tofus(ctx: &mut ShapingContext, base: usize, text: &str, font: Font) {
|
||||
x_advance,
|
||||
x_offset: Em::zero(),
|
||||
y_offset: Em::zero(),
|
||||
adjustability: Adjustability::default(),
|
||||
range: start..end,
|
||||
safe_to_break: true,
|
||||
c,
|
||||
@ -679,6 +751,43 @@ fn track_and_space(ctx: &mut ShapingContext) {
|
||||
}
|
||||
}
|
||||
|
||||
pub fn is_gb_style(lang: Lang, region: Option<Region>) -> bool {
|
||||
// Most CJK variants, including zh-CN, ja-JP, zh-SG, zh-MY use GB-style punctuation,
|
||||
// while zh-HK and zh-TW use alternative style. We default to use GB-style.
|
||||
!(lang == Lang::CHINESE
|
||||
&& matches!(region.as_ref().map(Region::as_str), Some("TW" | "HK")))
|
||||
}
|
||||
|
||||
/// Calculate stretchability and shrinkability of each glyph,
|
||||
/// and CJK punctuation adjustments according to Chinese Layout Requirements.
|
||||
fn calculate_adjustability(ctx: &mut ShapingContext, lang: Lang, region: Option<Region>) {
|
||||
let gb_style = is_gb_style(lang, region);
|
||||
|
||||
let mut glyphs = ctx.glyphs.iter_mut().peekable();
|
||||
while let Some(glyph) = glyphs.next() {
|
||||
glyph.adjustability = glyph.base_adjustability(gb_style);
|
||||
|
||||
// Only GB style needs further adjustment.
|
||||
if glyph.is_cjk_adjustable() && !gb_style {
|
||||
continue;
|
||||
}
|
||||
|
||||
// Now we apply consecutive punctuation adjustment, specified in Chinese Layout
|
||||
// Requirements, section 3.1.6.1 Punctuation Adjustment Space, and Japanese Layout
|
||||
// Requirements, section 3.1 Line Composition Rules for Punctuation Marks
|
||||
let Some(next) = glyphs.peek_mut() else { continue };
|
||||
let width = glyph.x_advance;
|
||||
let delta = width / 2.0;
|
||||
if next.is_cjk_adjustable()
|
||||
&& (glyph.shrinkability().1 + next.shrinkability().0) >= delta
|
||||
{
|
||||
let left_delta = glyph.shrinkability().1.min(delta);
|
||||
glyph.shrink_right(left_delta);
|
||||
next.shrink_left(delta - left_delta);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/// Difference between non-breaking and normal space.
|
||||
fn nbsp_delta(font: &Font) -> Option<Em> {
|
||||
let space = font.ttf().glyph_index(' ')?.0;
|
||||
|
Binary file not shown.
Before Width: | Height: | Size: 118 KiB After Width: | Height: | Size: 68 KiB |
Binary file not shown.
Before Width: | Height: | Size: 36 KiB After Width: | Height: | Size: 35 KiB |
Binary file not shown.
Before Width: | Height: | Size: 21 KiB After Width: | Height: | Size: 20 KiB |
@ -5,7 +5,7 @@
|
||||
// Most Chinese publications do not use hanging punctuation at line end.
|
||||
#set page(width: auto)
|
||||
#set par(justify: true)
|
||||
#set text(overhang: false, lang: "zh")
|
||||
#set text(font: "Noto Serif CJK SC", lang: "zh", overhang: false)
|
||||
|
||||
#rect(inset: 0pt, width: 80pt, fill: rgb("eee"))[
|
||||
中文维基百科使用汉字书写,汉字是汉族或华人的共同文字,是中国大陆、新加坡、马来西亚、台湾、香港、澳门的唯一官方文字或官方文字之一。25.9%,而美国和荷兰则分別占13.7%及8.2%。近年來,中国大陆地区的维基百科编辑者正在迅速增加;
|
||||
@ -33,4 +33,17 @@
|
||||
《书名》《测试》下一行
|
||||
|
||||
《书名》《测试》。
|
||||
]
|
||||
]
|
||||
|
||||
---
|
||||
// Test Variants of Mainland China, Hong Kong, and Japan.
|
||||
|
||||
// 17 characters a line.
|
||||
#set page(width: 170pt + 10pt, margin: (x: 5pt))
|
||||
#set text(font: "Noto Serif CJK SC", lang: "zh", overhang: false)
|
||||
#set par(justify: true)
|
||||
|
||||
孔雀最早见于《山海经》中的《海内经》:\u{200b}“有孔雀。”东汉杨孚著《异物志》记载,岭南:“孔雀,其大如大雁而足高,毛皆有斑纹彩,捕而蓄之,拍手即舞。”
|
||||
|
||||
#set text(font: "Noto Serif CJK TC", lang: "zh", region: "hk")
|
||||
孔雀最早见于《山海经》中的《海内经》:「有孔雀。」东汉杨孚著《异物志》记载,岭南:「孔雀,其大如大雁而足高,毛皆有斑纹彩,捕而蓄之,拍手即舞。」
|
||||
|
Loading…
x
Reference in New Issue
Block a user