diff --git a/Cargo.lock b/Cargo.lock index 14a297fdf..3fd1e3b8c 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -116,6 +116,12 @@ version = "1.1.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "d468802bab17cbc0cc575e9b053f41e72aa36bfa6b7f55e3529ffa43161b97fa" +[[package]] +name = "az" +version = "1.2.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "7b7e4c2464d97fe331d41de9d5db0def0a96f4d823b8b32a2efd503578988973" + [[package]] name = "base64" version = "0.13.1" @@ -1385,9 +1391,9 @@ checksum = "8835116a5c179084a830efb3adc117ab007512b535bc1a21c991d3b32a6b44dd" [[package]] name = "pdf-writer" -version = "0.7.0" +version = "0.7.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "63f45f7c7538e67c58cb4977e4f97bbd75fbd3990d827d28d597ec746291f644" +checksum = "30900f178ea696fc5d9637171f98aaa93d5aae54f0726726df68fc3e32810db6" dependencies = [ "bitflags 1.3.2", "itoa", @@ -2306,6 +2312,7 @@ dependencies = [ "tracing", "ttf-parser", "typst-macros", + "unicode-general-category", "unicode-math-class", "unicode-segmentation", "unicode-xid", @@ -2366,6 +2373,7 @@ dependencies = [ name = "typst-library" version = "0.3.0" dependencies = [ + "az", "chinese-number", "comemo", "csv", diff --git a/Cargo.toml b/Cargo.toml index a0e51002d..1c404061c 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -33,7 +33,7 @@ indexmap = "1.9.3" log = "0.4" miniz_oxide = "0.7" once_cell = "1" -pdf-writer = "0.7" +pdf-writer = "0.7.1" pixglyph = "0.1" regex = "1" resvg = { version = "0.32", default-features = false } @@ -46,6 +46,7 @@ svg2pdf = { git = "https://github.com/typst/svg2pdf" } tiny-skia = "0.9.0" tracing = "0.1.37" ttf-parser = "0.18.1" +unicode-general-category = "0.6" unicode-math-class = "0.1" unicode-segmentation = "1" unicode-xid = "0.2" diff --git a/assets/fonts/IBMPlexSansDevanagari-Regular.ttf b/assets/fonts/IBMPlexSansDevanagari-Regular.ttf new file mode 100644 index 000000000..5d7c8f0f1 Binary 
files /dev/null and b/assets/fonts/IBMPlexSansDevanagari-Regular.ttf differ diff --git a/docs/src/reference/details.yml b/docs/src/reference/details.yml index 1926fb774..22b679632 100644 --- a/docs/src/reference/details.yml +++ b/docs/src/reference/details.yml @@ -159,7 +159,7 @@ construct: | data-loading: | Data loading from external files. - These functions help you with embedding data from experiments and APIs in your + These functions help you with embedding data from experiments in your documents. utility: | diff --git a/library/Cargo.toml b/library/Cargo.toml index 033058f3d..499170cb9 100644 --- a/library/Cargo.toml +++ b/library/Cargo.toml @@ -16,6 +16,7 @@ bench = false [dependencies] typst = { path = ".." } +az = "1.2" chinese-number = { version = "0.7.2", default-features = false, features = ["number-to-chinese"] } comemo = "0.2.2" csv = "1" diff --git a/library/src/layout/par.rs b/library/src/layout/par.rs index a6ad647bd..0c3a9a3c5 100644 --- a/library/src/layout/par.rs +++ b/library/src/layout/par.rs @@ -1139,8 +1139,7 @@ fn line<'a>( // are no other items in the line. if hyphen || start + shaped.text.len() > range.end { if hyphen || start < range.end || before.is_empty() { - let shifted = start - base..range.end - base; - let mut reshaped = shaped.reshape(vt, &p.spans, shifted); + let mut reshaped = shaped.reshape(vt, &p.spans, start..range.end); if hyphen || shy { reshaped.push_hyphen(vt); } @@ -1162,8 +1161,7 @@ fn line<'a>( // Reshape if necessary. 
if range.start + shaped.text.len() > end { if range.start < end { - let shifted = range.start - base..end - base; - let reshaped = shaped.reshape(vt, &p.spans, shifted); + let reshaped = shaped.reshape(vt, &p.spans, range.start..end); width += reshaped.width; first = Some(Item::Text(reshaped)); } diff --git a/library/src/math/fragment.rs b/library/src/math/fragment.rs index b09916301..40dca3479 100644 --- a/library/src/math/fragment.rs +++ b/library/src/math/fragment.rs @@ -222,13 +222,13 @@ impl GlyphFragment { size: self.font_size, fill: self.fill, lang: self.lang, + text: self.c.into(), glyphs: vec![Glyph { id: self.id.0, - c: self.c, x_advance: Em::from_length(self.width, self.font_size), x_offset: Em::zero(), - span: self.span, - offset: 0, + range: 0..self.c.len_utf8() as u16, + span: (self.span, 0), }], }; let size = Size::new(self.width, self.ascent + self.descent); diff --git a/library/src/text/shaping.rs b/library/src/text/shaping.rs index 1e1ccc992..7d5703bc2 100644 --- a/library/src/text/shaping.rs +++ b/library/src/text/shaping.rs @@ -1,6 +1,7 @@ use std::ops::Range; use std::str::FromStr; +use az::SaturatingAs; use rustybuzz::{Feature, Tag, UnicodeBuffer}; use typst::font::{Font, FontVariant}; use typst::util::SliceExt; @@ -47,20 +48,18 @@ pub struct ShapedGlyph { pub x_offset: Em, /// The vertical offset of the glyph. pub y_offset: Em, - /// The byte index in the source text where this glyph's cluster starts. A - /// cluster is a sequence of one or multiple glyphs that cannot be - /// separated and must always be treated as a union. - pub cluster: usize, + /// The byte range of this glyph's cluster in the full paragraph. A cluster + /// is a sequence of one or multiple glyphs that cannot be separated and + /// must always be treated as a union. + pub range: Range, /// Whether splitting the shaping result before this glyph would yield the /// same results as shaping the parts to both sides of `text_index` /// separately. 
pub safe_to_break: bool, /// The first char in this glyph's cluster. pub c: char, - /// The source code location of the text. - pub span: Span, - /// The offset within the spanned text. - pub offset: u16, + /// The source code location of the glyph and its byte offset within it. + pub span: (Span, u16), } #[derive(Debug, Clone, Default)] @@ -181,6 +180,12 @@ impl<'a> ShapedText<'a> { for ((font, y_offset), group) in self.glyphs.as_ref().group_by_key(|g| (g.font.clone(), g.y_offset)) { + let mut range = group[0].range.clone(); + for glyph in group { + range.start = range.start.min(glyph.range.start); + range.end = range.end.max(glyph.range.end); + } + let pos = Point::new(offset, top + shift - y_offset.at(self.size)); let glyphs = group .iter() @@ -195,8 +200,8 @@ impl<'a> ShapedText<'a> { } else { glyph.stretchability().1 }; - let justification_left = adjustability_left * justification_ratio; + let justification_left = adjustability_left * justification_ratio; let mut justification_right = adjustability_right * justification_ratio; if glyph.is_justifiable() { @@ -206,15 +211,16 @@ impl<'a> ShapedText<'a> { frame.size_mut().x += justification_left.at(self.size) + justification_right.at(self.size); + Glyph { id: glyph.glyph_id, x_advance: glyph.x_advance + justification_left + justification_right, x_offset: glyph.x_offset + justification_left, - c: glyph.c, + range: (glyph.range.start - range.start).saturating_as() + ..(glyph.range.end - range.start).saturating_as(), span: glyph.span, - offset: glyph.offset, } }) .collect(); @@ -224,6 +230,7 @@ impl<'a> ShapedText<'a> { size: self.size, lang, fill: fill.clone(), + text: self.text[range.start - self.base..range.end - self.base].into(), glyphs, }; @@ -318,16 +325,19 @@ impl<'a> ShapedText<'a> { /// Reshape a range of the shaped text, reusing information from this /// shaping process if possible. + /// + /// The text `range` is relative to the whole paragraph. 
pub fn reshape( &'a self, vt: &Vt, spans: &SpanMapper, text_range: Range, ) -> ShapedText<'a> { + let text = &self.text[text_range.start - self.base..text_range.end - self.base]; if let Some(glyphs) = self.slice_safe_to_break(text_range.clone()) { Self { - base: self.base + text_range.start, - text: &self.text[text_range], + base: text_range.start, + text, dir: self.dir, styles: self.styles, size: self.size, @@ -336,14 +346,7 @@ impl<'a> ShapedText<'a> { glyphs: Cow::Borrowed(glyphs), } } else { - shape( - vt, - self.base + text_range.start, - &self.text[text_range], - spans, - self.styles, - self.dir, - ) + shape(vt, text_range.start, text, spans, self.styles, self.dir) } } @@ -358,7 +361,11 @@ impl<'a> ShapedText<'a> { let ttf = font.ttf(); let glyph_id = ttf.glyph_index('-')?; let x_advance = font.to_em(ttf.glyph_hor_advance(glyph_id)?); - let cluster = self.glyphs.last().map(|g| g.cluster).unwrap_or_default(); + let range = self + .glyphs + .last() + .map(|g| g.range.end..g.range.end) + .unwrap_or_default(); self.width += x_advance.at(self.size); self.glyphs.to_mut().push(ShapedGlyph { font, @@ -366,11 +373,10 @@ impl<'a> ShapedText<'a> { x_advance, x_offset: Em::zero(), y_offset: Em::zero(), - cluster, + range, safe_to_break: true, c: '-', - span: Span::detached(), - offset: 0, + span: (Span::detached(), 0), }); Some(()) }); @@ -396,9 +402,9 @@ impl<'a> ShapedText<'a> { // Handle edge cases. 
let len = self.glyphs.len(); - if text_index == 0 { + if text_index == self.base { return Some(if ltr { 0 } else { len }); - } else if text_index == self.text.len() { + } else if text_index == self.base + self.text.len() { return Some(if ltr { len } else { 0 }); } @@ -406,7 +412,7 @@ impl<'a> ShapedText<'a> { let mut idx = self .glyphs .binary_search_by(|g| { - let ordering = g.cluster.cmp(&text_index); + let ordering = g.range.start.cmp(&text_index); if ltr { ordering } else { @@ -422,7 +428,7 @@ impl<'a> ShapedText<'a> { // Search for the outermost glyph with the text index. while let Some(next) = next(idx, 1) { - if self.glyphs.get(next).map_or(true, |g| g.cluster != text_index) { + if self.glyphs.get(next).map_or(true, |g| g.range.start != text_index) { break; } idx = next; @@ -444,7 +450,6 @@ impl Debug for ShapedText<'_> { /// Holds shaping results and metadata common to all shaped segments. struct ShapingContext<'a> { vt: &'a Vt<'a>, - base: usize, spans: &'a SpanMapper, glyphs: Vec, used: Vec, @@ -468,7 +473,6 @@ pub fn shape<'a>( let size = TextElem::size_in(styles); let mut ctx = ShapingContext { vt, - base, spans, size, glyphs: vec![], @@ -481,7 +485,7 @@ pub fn shape<'a>( }; if !text.is_empty() { - shape_segment(&mut ctx, 0, text, families(styles)); + shape_segment(&mut ctx, base, text, families(styles)); } track_and_space(&mut ctx); @@ -552,6 +556,7 @@ fn shape_segment( let buffer = rustybuzz::shape(font.rusty(), &ctx.tags, buffer); let infos = buffer.glyph_infos(); let pos = buffer.glyph_positions(); + let ltr = ctx.dir.is_positive(); // Collect the shaped glyphs, doing fallback and shaping parts again with // the next font if necessary. @@ -560,68 +565,66 @@ fn shape_segment( let info = &infos[i]; let cluster = info.cluster as usize; + // Add the glyph to the shaped output. if info.glyph_id != 0 { - // Add the glyph to the shaped output. - // TODO: Don't ignore y_advance. 
- let (span, offset) = ctx.spans.span_at(ctx.base + cluster); - ctx.glyphs.push(ShapedGlyph { - font: font.clone(), - glyph_id: info.glyph_id as u16, - x_advance: font.to_em(pos[i].x_advance), - x_offset: font.to_em(pos[i].x_offset), - y_offset: font.to_em(pos[i].y_offset), - cluster: base + cluster, - safe_to_break: !info.unsafe_to_break(), - c: text[cluster..].chars().next().unwrap(), - span, - offset, - }); - } else { - // Determine the source text range for the tofu sequence. - let range = { - // First, search for the end of the tofu sequence. - let k = i; - while infos.get(i + 1).map_or(false, |info| info.glyph_id == 0) { - i += 1; - } - - // Then, determine the start and end text index. - // - // Examples: - // Everything is shown in visual order. Tofus are written as "_". - // We want to find out that the tofus span the text `2..6`. - // Note that the clusters are longer than 1 char. - // - // Left-to-right: - // Text: h a l i h a l l o - // Glyphs: A _ _ C E - // Clusters: 0 2 4 6 8 - // k=1 i=2 - // - // Right-to-left: - // Text: O L L A H I L A H - // Glyphs: E C _ _ A - // Clusters: 8 6 4 2 0 - // k=2 i=3 - let ltr = ctx.dir.is_positive(); - let first = if ltr { k } else { i }; - let start = infos[first].cluster as usize; - let last = if ltr { i.checked_add(1) } else { k.checked_sub(1) }; - let end = last + // Determine the text range of the glyph. + let start = base + cluster; + let end = base + + if ltr { i.checked_add(1) } else { i.checked_sub(1) } .and_then(|last| infos.get(last)) .map_or(text.len(), |info| info.cluster as usize); - start..end - }; + ctx.glyphs.push(ShapedGlyph { + font: font.clone(), + glyph_id: info.glyph_id as u16, + // TODO: Don't ignore y_advance. 
+ x_advance: font.to_em(pos[i].x_advance), + x_offset: font.to_em(pos[i].x_offset), + y_offset: font.to_em(pos[i].y_offset), + range: start..end, + safe_to_break: !info.unsafe_to_break(), + c: text[cluster..].chars().next().unwrap(), + span: ctx.spans.span_at(start), + }); + } else { + // First, search for the end of the tofu sequence. + let k = i; + while infos.get(i + 1).map_or(false, |info| info.glyph_id == 0) { + i += 1; + } + + // Then, determine the start and end text index for the tofu + // sequence. + // + // Examples: + // Everything is shown in visual order. Tofus are written as "_". + // We want to find out that the tofus span the text `2..6`. + // Note that the clusters are longer than 1 char. + // + // Left-to-right: + // Text: h a l i h a l l o + // Glyphs: A _ _ C E + // Clusters: 0 2 4 6 8 + // k=1 i=2 + // + // Right-to-left: + // Text: O L L A H I L A H + // Glyphs: E C _ _ A + // Clusters: 8 6 4 2 0 + // k=2 i=3 + let start = infos[if ltr { k } else { i }].cluster as usize; + let end = if ltr { i.checked_add(1) } else { k.checked_sub(1) } + .and_then(|last| infos.get(last)) + .map_or(text.len(), |info| info.cluster as usize); // Trim half-baked cluster. - let remove = base + range.start..base + range.end; - while ctx.glyphs.last().map_or(false, |g| remove.contains(&g.cluster)) { + let remove = base + start..base + end; + while ctx.glyphs.last().map_or(false, |g| remove.contains(&g.range.start)) { ctx.glyphs.pop(); } // Recursively shape the tofu sequence with the next family. 
- shape_segment(ctx, base + range.start, &text[range], families.clone()); + shape_segment(ctx, base + start, &text[start..end], families.clone()); } i += 1; @@ -634,19 +637,18 @@ fn shape_segment( fn shape_tofus(ctx: &mut ShapingContext, base: usize, text: &str, font: Font) { let x_advance = font.advance(0).unwrap_or_default(); for (cluster, c) in text.char_indices() { - let cluster = base + cluster; - let (span, offset) = ctx.spans.span_at(ctx.base + cluster); + let start = base + cluster; + let end = start + c.len_utf8(); ctx.glyphs.push(ShapedGlyph { font: font.clone(), glyph_id: 0, x_advance, x_offset: Em::zero(), y_offset: Em::zero(), - cluster, + range: start..end, safe_to_break: true, c, - span, - offset, + span: ctx.spans.span_at(start), }); } } @@ -668,7 +670,10 @@ fn track_and_space(ctx: &mut ShapingContext) { glyph.x_advance = spacing.relative_to(glyph.x_advance); } - if glyphs.peek().map_or(false, |next| glyph.cluster != next.cluster) { + if glyphs + .peek() + .map_or(false, |next| glyph.range.start != next.range.start) + { glyph.x_advance += tracking; } } diff --git a/src/doc.rs b/src/doc.rs index 0a744ffcf..0a9b9a8dd 100644 --- a/src/doc.rs +++ b/src/doc.rs @@ -1,7 +1,8 @@ //! Finished documents. -use std::fmt::{self, Debug, Formatter, Write}; +use std::fmt::{self, Debug, Formatter}; use std::num::NonZeroUsize; +use std::ops::Range; use std::str::FromStr; use std::sync::Arc; @@ -114,23 +115,6 @@ impl Frame { pub fn items(&self) -> std::slice::Iter<'_, (Point, FrameItem)> { self.items.iter() } - - /// Approximately recover the text inside of the frame and its children. - pub fn text(&self) -> EcoString { - let mut text = EcoString::new(); - for (_, item) in self.items() { - match item { - FrameItem::Text(item) => { - for glyph in &item.glyphs { - text.push(glyph.c); - } - } - FrameItem::Group(group) => text.push_str(&group.frame.text()), - _ => {} - } - } - text - } } /// Insert items and subframes. 
@@ -476,6 +460,8 @@ pub struct TextItem { pub fill: Paint, /// The natural language of the text. pub lang: Lang, + /// The item's plain text. + pub text: EcoString, /// The glyphs. pub glyphs: Vec, } @@ -489,19 +475,14 @@ impl TextItem { impl Debug for TextItem { fn fmt(&self, f: &mut Formatter) -> fmt::Result { - // This is only a rough approximation of the source text. - f.write_str("Text(\"")?; - for glyph in &self.glyphs { - for c in glyph.c.escape_debug() { - f.write_char(c)?; - } - } - f.write_str("\")") + f.write_str("Text(")?; + self.text.fmt(f)?; + f.write_str(")") } } /// A glyph in a run of shaped text. -#[derive(Debug, Copy, Clone, Eq, PartialEq, Hash)] +#[derive(Debug, Clone, Eq, PartialEq, Hash)] pub struct Glyph { /// The glyph's index in the font. pub id: u16, @@ -509,12 +490,17 @@ pub struct Glyph { pub x_advance: Em, /// The horizontal offset of the glyph. pub x_offset: Em, - /// The first character of the glyph's cluster. - pub c: char, + /// The range of the glyph in its item's text. + pub range: Range, /// The source code location of the text. - pub span: Span, - /// The offset within the spanned text. - pub offset: u16, + pub span: (Span, u16), +} + +impl Glyph { + /// The range of the glyph in its item's text. + pub fn range(&self) -> Range { + usize::from(self.range.start)..usize::from(self.range.end) + } } /// An identifier for a natural language. 
diff --git a/src/export/pdf/font.rs b/src/export/pdf/font.rs index de79976ab..1e2f9c93b 100644 --- a/src/export/pdf/font.rs +++ b/src/export/pdf/font.rs @@ -1,13 +1,21 @@ use std::collections::BTreeMap; -use ecow::eco_format; +use ecow::{eco_format, EcoString}; use pdf_writer::types::{CidFontType, FontFlags, SystemInfo, UnicodeCmap}; use pdf_writer::{Filter, Finish, Name, Rect, Str}; use ttf_parser::{name_id, GlyphId, Tag}; +use unicode_general_category::GeneralCategory; use super::{deflate, EmExt, PdfContext, RefExt}; use crate::util::SliceExt; +const CMAP_NAME: Name = Name(b"Custom"); +const SYSTEM_INFO: SystemInfo = SystemInfo { + registry: Str(b"Adobe"), + ordering: Str(b"Identity"), + supplement: 0, +}; + /// Embed all used fonts into the PDF. #[tracing::instrument(skip_all)] pub fn write_fonts(ctx: &mut PdfContext) { @@ -19,7 +27,7 @@ pub fn write_fonts(ctx: &mut PdfContext) { let data_ref = ctx.alloc.bump(); ctx.font_refs.push(type0_ref); - let glyphs = &ctx.glyph_sets[font]; + let glyph_set = ctx.glyph_sets.get_mut(font).unwrap(); let metrics = font.metrics(); let ttf = font.ttf(); @@ -29,12 +37,6 @@ pub fn write_fonts(ctx: &mut PdfContext) { let base_font = eco_format!("ABCDEF+{}", postscript_name); let base_font = Name(base_font.as_bytes()); - let cmap_name = Name(b"Custom"); - let system_info = SystemInfo { - registry: Str(b"Adobe"), - ordering: Str(b"Identity"), - supplement: 0, - }; // Write the base font object referencing the CID font. ctx.writer @@ -59,7 +61,7 @@ pub fn write_fonts(ctx: &mut PdfContext) { let mut cid = ctx.writer.cid_font(cid_ref); cid.subtype(subtype); cid.base_font(base_font); - cid.system_info(system_info); + cid.system_info(SYSTEM_INFO); cid.font_descriptor(descriptor_ref); cid.default_width(0.0); @@ -70,7 +72,7 @@ pub fn write_fonts(ctx: &mut PdfContext) { // Extract the widths of all glyphs. 
let num_glyphs = ttf.number_of_glyphs(); let mut widths = vec![0.0; num_glyphs as usize]; - for &g in glyphs { + for &g in glyph_set.keys() { let x = ttf.glyph_hor_advance(GlyphId(g)).unwrap_or(0); widths[g as usize] = font.to_em(x).to_font_units(); } @@ -130,42 +132,15 @@ pub fn write_fonts(ctx: &mut PdfContext) { font_descriptor.finish(); - // Compute a reverse mapping from glyphs to unicode. - let cmap = { - let mut mapping = BTreeMap::new(); - for subtable in - ttf.tables().cmap.into_iter().flat_map(|table| table.subtables) - { - if subtable.is_unicode() { - subtable.codepoints(|n| { - if let Some(c) = std::char::from_u32(n) { - if let Some(GlyphId(g)) = ttf.glyph_index(c) { - if glyphs.contains(&g) { - mapping.insert(g, c); - } - } - } - }); - } - } - - let mut cmap = UnicodeCmap::new(cmap_name, system_info); - for (g, c) in mapping { - cmap.pair(g, c); - } - cmap - }; - // Write the /ToUnicode character map, which maps glyph ids back to // unicode codepoints to enable copying out of the PDF. - ctx.writer - .cmap(cmap_ref, &deflate(&cmap.finish())) - .filter(Filter::FlateDecode); + let cmap = create_cmap(ttf, glyph_set); + ctx.writer.cmap(cmap_ref, &cmap.finish()); // Subset and write the font's bytes. let data = font.data(); let subsetted = { - let glyphs: Vec<_> = glyphs.iter().copied().collect(); + let glyphs: Vec<_> = glyph_set.keys().copied().collect(); let profile = subsetter::Profile::pdf(&glyphs); subsetter::subset(data, font.index(), profile) }; @@ -183,3 +158,44 @@ pub fn write_fonts(ctx: &mut PdfContext) { stream.finish(); } } + +/// Create a /ToUnicode CMap. +fn create_cmap( + ttf: &ttf_parser::Face, + glyph_set: &mut BTreeMap, +) -> UnicodeCmap { + // For glyphs that have codepoints mapping to in the font's cmap table, we + // prefer them over pre-existing text mappings from the document. 
Only + // things that don't have a corresponding codepoint (or only a private-use + // one) like the "Th" in Linux Libertine get the text of their first + // occurrences in the document instead. + for subtable in ttf.tables().cmap.into_iter().flat_map(|table| table.subtables) { + if !subtable.is_unicode() { + continue; + } + + subtable.codepoints(|n| { + let Some(c) = std::char::from_u32(n) else { return }; + if unicode_general_category::get_general_category(c) + == GeneralCategory::PrivateUse + { + return; + } + + let Some(GlyphId(g)) = ttf.glyph_index(c) else { return }; + if glyph_set.contains_key(&g) { + glyph_set.insert(g, c.into()); + } + }); + } + + // Produce a reverse mapping from glyphs to unicode strings. + let mut cmap = UnicodeCmap::new(CMAP_NAME, SYSTEM_INFO); + for (&g, text) in glyph_set.iter() { + if !text.is_empty() { + cmap.pair_with_multiple(g, text.chars()); + } + } + + cmap +} diff --git a/src/export/pdf/mod.rs b/src/export/pdf/mod.rs index ffbf67a35..484858628 100644 --- a/src/export/pdf/mod.rs +++ b/src/export/pdf/mod.rs @@ -6,9 +6,10 @@ mod outline; mod page; use std::cmp::Eq; -use std::collections::{HashMap, HashSet}; +use std::collections::{BTreeMap, HashMap}; use std::hash::Hash; +use ecow::EcoString; use pdf_writer::types::Direction; use pdf_writer::{Finish, Name, PdfWriter, Ref, TextStr}; use xmp_writer::{LangId, RenditionClass, XmpWriter}; @@ -52,7 +53,13 @@ pub struct PdfContext<'a> { page_refs: Vec, font_map: Remapper, image_map: Remapper, - glyph_sets: HashMap>, + /// For each font a mapping from used glyphs to their text representation. + /// May contain multiple chars in case of ligatures or similar things. The + /// same glyph can have a different text representation within one document, + /// then we just save the first one. The resulting strings are used for the + /// PDF's /ToUnicode map for glyphs that don't have an entry in the font's + /// cmap. This is important for copy-paste and searching. 
+ glyph_sets: HashMap>, languages: HashMap, } diff --git a/src/export/pdf/page.rs b/src/export/pdf/page.rs index 35a4f5dcd..22e590d51 100644 --- a/src/export/pdf/page.rs +++ b/src/export/pdf/page.rs @@ -364,11 +364,12 @@ fn write_group(ctx: &mut PageContext, pos: Point, group: &GroupItem) { /// Encode a text run into the content stream. fn write_text(ctx: &mut PageContext, x: f32, y: f32, text: &TextItem) { *ctx.parent.languages.entry(text.lang).or_insert(0) += text.glyphs.len(); - ctx.parent - .glyph_sets - .entry(text.font.clone()) - .or_default() - .extend(text.glyphs.iter().map(|g| g.id)); + + let glyph_set = ctx.parent.glyph_sets.entry(text.font.clone()).or_default(); + for g in &text.glyphs { + let segment = &text.text[g.range()]; + glyph_set.entry(g.id).or_insert_with(|| segment.into()); + } ctx.set_fill(&text.fill); ctx.set_font(&text.font, text.size); diff --git a/src/ide/jump.rs b/src/ide/jump.rs index fc98747c7..42ed2ab5d 100644 --- a/src/ide/jump.rs +++ b/src/ide/jump.rs @@ -67,7 +67,8 @@ pub fn jump_from_click( FrameItem::Text(text) => { for glyph in &text.glyphs { - if glyph.span.is_detached() { + let (span, span_offset) = glyph.span; + if span.is_detached() { continue; } @@ -77,13 +78,13 @@ pub fn jump_from_click( Size::new(width, text.size), click, ) { - let source = world.source(glyph.span.source()); - let node = source.find(glyph.span)?; + let source = world.source(span.source()); + let node = source.find(span)?; let pos = if node.kind() == SyntaxKind::Text { let range = node.range(); - let mut offset = range.start + usize::from(glyph.offset); + let mut offset = range.start + usize::from(span_offset); if (click.x - pos.x) > width / 2.0 { - offset += glyph.c.len_utf8(); + offset += glyph.range().len(); } offset.min(range.end) } else { @@ -150,7 +151,7 @@ fn find_in_frame(frame: &Frame, span: Span) -> Option { if let FrameItem::Text(text) = item { for glyph in &text.glyphs { - if glyph.span == span { + if glyph.span.0 == span { return Some(pos); } 
pos.x += glyph.x_advance.at(text.size); diff --git a/tests/ref/text/copy-paste.png b/tests/ref/text/copy-paste.png new file mode 100644 index 000000000..cbbad9405 Binary files /dev/null and b/tests/ref/text/copy-paste.png differ diff --git a/tests/ref/text/shaping.png b/tests/ref/text/shaping.png index 7b33074fc..278fe8ee1 100644 Binary files a/tests/ref/text/shaping.png and b/tests/ref/text/shaping.png differ diff --git a/tests/src/tests.rs b/tests/src/tests.rs index 0e22084ce..2a0b74ea9 100644 --- a/tests/src/tests.rs +++ b/tests/src/tests.rs @@ -353,9 +353,18 @@ fn test( pdf_path: Option<&Path>, args: &Args, ) -> bool { - let name = src_path.strip_prefix(TYP_DIR).unwrap_or(src_path); + struct PanicGuard<'a>(&'a Path); + impl Drop for PanicGuard<'_> { + fn drop(&mut self) { + if std::thread::panicking() { + println!("Panicked in {}", self.0.display()); + } + } + } + let name = src_path.strip_prefix(TYP_DIR).unwrap_or(src_path); let text = fs::read_to_string(src_path).unwrap(); + let _guard = PanicGuard(name); let mut output = String::new(); let mut ok = true; @@ -401,6 +410,7 @@ fn test( line, &mut rng, ); + ok &= part_ok; compare_ever |= compare_here; frames.extend(part_frames); diff --git a/tests/typ/text/copy-paste.typ b/tests/typ/text/copy-paste.typ new file mode 100644 index 000000000..5d8264825 --- /dev/null +++ b/tests/typ/text/copy-paste.typ @@ -0,0 +1,8 @@ +// Test copy-paste and search in PDF with ligatures +// and Arabic test. Must be tested manually! + +--- +The after fira 🏳️‍🌈! + +#set text(lang: "ar", font: "Noto Sans Arabic") +مرحبًا