Segment by script

This commit is contained in:
Laurenz 2022-04-12 21:36:37 +02:00
parent 56968bc0d6
commit c3a387b8f7
9 changed files with 70 additions and 45 deletions

1
Cargo.lock generated
View File

@ -873,6 +873,7 @@ dependencies = [
"typed-arena",
"typst-macros",
"unicode-bidi",
"unicode-script",
"unicode-segmentation",
"unicode-xid",
"usvg",

View File

@ -30,6 +30,7 @@ rustybuzz = "0.4"
unicode-bidi = "0.3.5"
unicode-segmentation = "1"
unicode-xid = "0.2"
unicode-script = "0.5"
xi-unicode = "0.3"
# Raster and vector graphics handling

Binary file not shown.

View File

@ -1,13 +1,14 @@
use std::sync::Arc;
use unicode_bidi::{BidiInfo, Level};
use unicode_script::{Script, UnicodeScript};
use xi_unicode::LineBreakIterator;
use super::{shape, Lang, ShapedText, TextNode};
use crate::font::FontStore;
use crate::library::layout::Spacing;
use crate::library::prelude::*;
use crate::util::{ArcExt, EcoString, SliceExt};
use crate::util::{ArcExt, EcoString};
/// Arrange text, spacing and inline-level nodes into a paragraph.
#[derive(Hash)]
@ -437,23 +438,46 @@ fn prepare<'a>(
_ => None,
});
let mut items = vec![];
let mut cursor = 0;
let mut items = vec![];
// Layout the children and collect them into items.
for (segment, styles) in segments {
let end = cursor + segment.len();
match segment {
Segment::Text(len) => {
// TODO: Also split by script.
let mut start = cursor;
for (level, count) in bidi.levels[cursor .. cursor + len].group() {
let end = start + count;
let text = &bidi.text[start .. end];
Segment::Text(_) => {
let mut process = |text, level: Level| {
let dir = if level.is_ltr() { Dir::LTR } else { Dir::RTL };
let shaped = shape(&mut ctx.fonts, text, styles, dir);
items.push(Item::Text(shaped));
start = end;
};
let mut prev_level = Level::ltr();
let mut prev_script = Script::Unknown;
// Group by embedding level and script.
for i in cursor .. end {
if !text.is_char_boundary(i) {
continue;
}
let level = bidi.levels[i];
let script =
text[i ..].chars().next().map_or(Script::Unknown, |c| c.script());
if level != prev_level || !is_compatible(script, prev_script) {
if cursor < i {
process(&text[cursor .. i], prev_level);
}
cursor = i;
prev_level = level;
prev_script = script;
} else if is_generic_script(prev_script) {
prev_script = script;
}
}
process(&text[cursor .. end], prev_level);
}
Segment::Spacing(spacing) => match spacing {
Spacing::Relative(v) => {
@ -482,12 +506,22 @@ fn prepare<'a>(
}
}
cursor += segment.len();
cursor = end;
}
Ok(Preparation { bidi, items, styles, children: &par.0 })
}
/// Whether the script is one of the non-specific values `Unknown`, `Common`
/// or `Inherited` rather than a concrete script.
fn is_generic_script(script: Script) -> bool {
    match script {
        Script::Unknown | Script::Common | Script::Inherited => true,
        _ => false,
    }
}
/// Whether these two scripts can be part of the same shape run.
///
/// This holds when both scripts are equal or when at least one of them is
/// generic according to `is_generic_script`.
fn is_compatible(a: Script, b: Script) -> bool {
    a == b || is_generic_script(a) || is_generic_script(b)
}
/// Find suitable linebreaks.
fn linebreak<'a>(
p: &'a Preparation<'a>,

View File

@ -12,7 +12,6 @@ use crate::util::SliceExt;
/// This type contains owned or borrowed shaped text runs, which can be
/// measured, used to reshape substrings more quickly and converted into a
/// frame.
#[derive(Debug, Clone)]
pub struct ShapedText<'a> {
/// The text that was shaped.
pub text: &'a str,
@ -269,11 +268,13 @@ impl<'a> ShapedText<'a> {
// RTL needs offset one because the left side of the range should be
// exclusive and the right side inclusive, contrary to the normal
// behaviour of ranges.
if !ltr {
idx += 1;
}
self.glyphs[idx].safe_to_break.then(|| idx + (!ltr) as usize)
}
}
self.glyphs[idx].safe_to_break.then(|| idx)
/// Debug-formats shaped text by printing only the text that was shaped.
impl Debug for ShapedText<'_> {
    fn fmt(&self, f: &mut Formatter) -> fmt::Result {
        // Forward to the `Debug` implementation of the underlying string.
        Debug::fmt(self.text, f)
    }
}

View File

@ -103,12 +103,6 @@ where
/// Additional methods for slices.
pub trait SliceExt<T> {
/// Find consecutive runs of the same elements in a slice and yield for
/// each such run the element and number of times it appears.
fn group(&self) -> Group<'_, T>
where
T: PartialEq;
/// Split a slice into consecutive runs with the same key and yield for
/// each such run the key and the slice of elements with that key.
fn group_by_key<K, F>(&self, f: F) -> GroupByKey<'_, T, F>
@ -118,35 +112,11 @@ pub trait SliceExt<T> {
}
impl<T> SliceExt<T> for [T] {
// Wraps the whole slice in a `Group`; the runs are determined lazily while
// iterating.
fn group(&self) -> Group<'_, T> {
Group { slice: self }
}
// Stores the slice together with the key function `f` in a `GroupByKey`.
fn group_by_key<K, F>(&self, f: F) -> GroupByKey<'_, T, F> {
GroupByKey { slice: self, f }
}
}
/// An iterator over consecutive runs of equal elements in a slice.
///
/// Each item is a reference to the first element of a run together with the
/// number of elements in that run.
///
/// This struct is created by [`SliceExt::group`].
pub struct Group<'a, T> {
    /// The part of the slice that has not been yielded yet.
    slice: &'a [T],
}

impl<'a, T> Iterator for Group<'a, T>
where
    T: PartialEq,
{
    type Item = (&'a T, usize);

    fn next(&mut self) -> Option<Self::Item> {
        // An empty remainder means that every run has been yielded.
        let (head, tail) = self.slice.split_first()?;
        // The run consists of `head` plus all directly following equal elements.
        let run = 1 + tail.iter().take_while(|&item| item == head).count();
        self.slice = &self.slice[run ..];
        Some((head, run))
    }
}
/// This struct is created by [`SliceExt::group_by_key`].
pub struct GroupByKey<'a, T, F> {
slice: &'a [T],

Binary file not shown.

Before

Width:  |  Height:  |  Size: 18 KiB

After

Width:  |  Height:  |  Size: 2.6 KiB

Binary file not shown.

Before

Width:  |  Height:  |  Size: 6.3 KiB

After

Width:  |  Height:  |  Size: 6.3 KiB

View File

@ -0,0 +1,18 @@
// Test shaping quirks.
---
// Test separation by script.
ABCअपार्टमेंट
// This is how it should look.
अपार्टमेंट
// This (without the spaces) is how it would look
// if we didn't separate by script.
अ पा र् ट में ट
---
// Test that RTL safe-to-break doesn't panic even though newline
// doesn't exist in shaping output.
#set text(dir: rtl, "Noto Serif Hebrew")
\ ט