Make ligatures copyable and searchable

Fixes #479 Fixes #1040
2023-05-03 10:33:18 +02:00 · 2023-05-03 10:33:18 +02:00 · ad347632ab
commit ad347632ab
parent bcc014c4e1
17 changed files with 229 additions and 187 deletions
--- a/Cargo.lock
+++ b/Cargo.lock
@ -116,6 +116,12 @@ version = "1.1.0"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "d468802bab17cbc0cc575e9b053f41e72aa36bfa6b7f55e3529ffa43161b97fa"

+[[package]]
+name = "az"
+version = "1.2.1"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "7b7e4c2464d97fe331d41de9d5db0def0a96f4d823b8b32a2efd503578988973"
+
 [[package]]
 name = "base64"
 version = "0.13.1"
@ -1385,9 +1391,9 @@ checksum = "8835116a5c179084a830efb3adc117ab007512b535bc1a21c991d3b32a6b44dd"

 [[package]]
 name = "pdf-writer"
-version = "0.7.0"
+version = "0.7.1"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "63f45f7c7538e67c58cb4977e4f97bbd75fbd3990d827d28d597ec746291f644"
+checksum = "30900f178ea696fc5d9637171f98aaa93d5aae54f0726726df68fc3e32810db6"
 dependencies = [
 "bitflags 1.3.2",
 "itoa",
@ -2306,6 +2312,7 @@ dependencies = [
 "tracing",
 "ttf-parser",
 "typst-macros",
+ "unicode-general-category",
 "unicode-math-class",
 "unicode-segmentation",
 "unicode-xid",
@ -2366,6 +2373,7 @@ dependencies = [
 name = "typst-library"
 version = "0.3.0"
 dependencies = [
+ "az",
 "chinese-number",
 "comemo",
 "csv",
--- a/Cargo.toml
+++ b/Cargo.toml
@ -33,7 +33,7 @@ indexmap = "1.9.3"
 log = "0.4"
 miniz_oxide = "0.7"
 once_cell = "1"
-pdf-writer = "0.7"
+pdf-writer = "0.7.1"
 pixglyph = "0.1"
 regex = "1"
 resvg = { version = "0.32", default-features = false }
@ -46,6 +46,7 @@ svg2pdf = { git = "https://github.com/typst/svg2pdf" }
 tiny-skia = "0.9.0"
 tracing = "0.1.37"
 ttf-parser = "0.18.1"
+unicode-general-category = "0.6"
 unicode-math-class = "0.1"
 unicode-segmentation = "1"
 unicode-xid = "0.2"
--- a/assets/fonts/IBMPlexSansDevanagari-Regular.ttf
+++ b/assets/fonts/IBMPlexSansDevanagari-Regular.ttf
--- a/docs/src/reference/details.yml
+++ b/docs/src/reference/details.yml
@ -159,7 +159,7 @@ construct: |
 data-loading: |
  Data loading from external files.

-  These functions help you with embedding data from experiments and APIs in your
+  These functions help you with embedding data from experiments in your
  documents.

 utility: |
--- a/library/Cargo.toml
+++ b/library/Cargo.toml
@ -16,6 +16,7 @@ bench = false

 [dependencies]
 typst = { path = ".." }
+az = "1.2"
 chinese-number = { version = "0.7.2", default-features = false, features = ["number-to-chinese"] }
 comemo = "0.2.2"
 csv = "1"
--- a/library/src/layout/par.rs
+++ b/library/src/layout/par.rs
@ -1139,8 +1139,7 @@ fn line<'a>(
        // are no other items in the line.
        if hyphen || start + shaped.text.len() > range.end {
            if hyphen || start < range.end || before.is_empty() {
-                let shifted = start - base..range.end - base;
-                let mut reshaped = shaped.reshape(vt, &p.spans, shifted);
+                let mut reshaped = shaped.reshape(vt, &p.spans, start..range.end);
                if hyphen || shy {
                    reshaped.push_hyphen(vt);
                }
@ -1162,8 +1161,7 @@ fn line<'a>(
        // Reshape if necessary.
        if range.start + shaped.text.len() > end {
            if range.start < end {
-                let shifted = range.start - base..end - base;
-                let reshaped = shaped.reshape(vt, &p.spans, shifted);
+                let reshaped = shaped.reshape(vt, &p.spans, range.start..end);
                width += reshaped.width;
                first = Some(Item::Text(reshaped));
            }
--- a/library/src/math/fragment.rs
+++ b/library/src/math/fragment.rs
@ -222,13 +222,13 @@ impl GlyphFragment {
            size: self.font_size,
            fill: self.fill,
            lang: self.lang,
+            text: self.c.into(),
            glyphs: vec![Glyph {
                id: self.id.0,
-                c: self.c,
                x_advance: Em::from_length(self.width, self.font_size),
                x_offset: Em::zero(),
-                span: self.span,
-                offset: 0,
+                range: 0..self.c.len_utf8() as u16,
+                span: (self.span, 0),
            }],
        };
        let size = Size::new(self.width, self.ascent + self.descent);
--- a/library/src/text/shaping.rs
+++ b/library/src/text/shaping.rs
@ -1,6 +1,7 @@
 use std::ops::Range;
 use std::str::FromStr;

+use az::SaturatingAs;
 use rustybuzz::{Feature, Tag, UnicodeBuffer};
 use typst::font::{Font, FontVariant};
 use typst::util::SliceExt;
@ -47,20 +48,18 @@ pub struct ShapedGlyph {
    pub x_offset: Em,
    /// The vertical offset of the glyph.
    pub y_offset: Em,
-    /// The byte index in the source text where this glyph's cluster starts. A
-    /// cluster is a sequence of one or multiple glyphs that cannot be
-    /// separated and must always be treated as a union.
-    pub cluster: usize,
+    /// The byte range of this glyph's cluster in the full paragraph. A cluster
+    /// is a sequence of one or multiple glyphs that cannot be separated and
+    /// must always be treated as a union.
+    pub range: Range<usize>,
    /// Whether splitting the shaping result before this glyph would yield the
    /// same results as shaping the parts to both sides of `text_index`
    /// separately.
    pub safe_to_break: bool,
    /// The first char in this glyph's cluster.
    pub c: char,
-    /// The source code location of the text.
-    pub span: Span,
-    /// The offset within the spanned text.
-    pub offset: u16,
+    /// The source code location of the glyph and its byte offset within it.
+    pub span: (Span, u16),
 }

 #[derive(Debug, Clone, Default)]
@ -181,6 +180,12 @@ impl<'a> ShapedText<'a> {
        for ((font, y_offset), group) in
            self.glyphs.as_ref().group_by_key(|g| (g.font.clone(), g.y_offset))
        {
+            let mut range = group[0].range.clone();
+            for glyph in group {
+                range.start = range.start.min(glyph.range.start);
+                range.end = range.end.max(glyph.range.end);
+            }
+
            let pos = Point::new(offset, top + shift - y_offset.at(self.size));
            let glyphs = group
                .iter()
@ -195,8 +200,8 @@ impl<'a> ShapedText<'a> {
                    } else {
                        glyph.stretchability().1
                    };
-                    let justification_left = adjustability_left * justification_ratio;

+                    let justification_left = adjustability_left * justification_ratio;
                    let mut justification_right =
                        adjustability_right * justification_ratio;
                    if glyph.is_justifiable() {
@ -206,15 +211,16 @@ impl<'a> ShapedText<'a> {

                    frame.size_mut().x += justification_left.at(self.size)
                        + justification_right.at(self.size);
+
                    Glyph {
                        id: glyph.glyph_id,
                        x_advance: glyph.x_advance
                            + justification_left
                            + justification_right,
                        x_offset: glyph.x_offset + justification_left,
-                        c: glyph.c,
+                        range: (glyph.range.start - range.start).saturating_as()
+                            ..(glyph.range.end - range.start).saturating_as(),
                        span: glyph.span,
-                        offset: glyph.offset,
                    }
                })
                .collect();
@ -224,6 +230,7 @@ impl<'a> ShapedText<'a> {
                size: self.size,
                lang,
                fill: fill.clone(),
+                text: self.text[range.start - self.base..range.end - self.base].into(),
                glyphs,
            };

@ -318,16 +325,19 @@ impl<'a> ShapedText<'a> {

    /// Reshape a range of the shaped text, reusing information from this
    /// shaping process if possible.
+    ///
+    /// The `text_range` is relative to the whole paragraph.
    pub fn reshape(
        &'a self,
        vt: &Vt,
        spans: &SpanMapper,
        text_range: Range<usize>,
    ) -> ShapedText<'a> {
+        let text = &self.text[text_range.start - self.base..text_range.end - self.base];
        if let Some(glyphs) = self.slice_safe_to_break(text_range.clone()) {
            Self {
-                base: self.base + text_range.start,
-                text: &self.text[text_range],
+                base: text_range.start,
+                text,
                dir: self.dir,
                styles: self.styles,
                size: self.size,
@ -336,14 +346,7 @@ impl<'a> ShapedText<'a> {
                glyphs: Cow::Borrowed(glyphs),
            }
        } else {
-            shape(
-                vt,
-                self.base + text_range.start,
-                &self.text[text_range],
-                spans,
-                self.styles,
-                self.dir,
-            )
+            shape(vt, text_range.start, text, spans, self.styles, self.dir)
        }
    }

@ -358,7 +361,11 @@ impl<'a> ShapedText<'a> {
            let ttf = font.ttf();
            let glyph_id = ttf.glyph_index('-')?;
            let x_advance = font.to_em(ttf.glyph_hor_advance(glyph_id)?);
-            let cluster = self.glyphs.last().map(|g| g.cluster).unwrap_or_default();
+            let range = self
+                .glyphs
+                .last()
+                .map(|g| g.range.end..g.range.end)
+                .unwrap_or_default();
            self.width += x_advance.at(self.size);
            self.glyphs.to_mut().push(ShapedGlyph {
                font,
@ -366,11 +373,10 @@ impl<'a> ShapedText<'a> {
                x_advance,
                x_offset: Em::zero(),
                y_offset: Em::zero(),
-                cluster,
+                range,
                safe_to_break: true,
                c: '-',
-                span: Span::detached(),
-                offset: 0,
+                span: (Span::detached(), 0),
            });
            Some(())
        });
@ -396,9 +402,9 @@ impl<'a> ShapedText<'a> {

        // Handle edge cases.
        let len = self.glyphs.len();
-        if text_index == 0 {
+        if text_index == self.base {
            return Some(if ltr { 0 } else { len });
-        } else if text_index == self.text.len() {
+        } else if text_index == self.base + self.text.len() {
            return Some(if ltr { len } else { 0 });
        }

@ -406,7 +412,7 @@ impl<'a> ShapedText<'a> {
        let mut idx = self
            .glyphs
            .binary_search_by(|g| {
-                let ordering = g.cluster.cmp(&text_index);
+                let ordering = g.range.start.cmp(&text_index);
                if ltr {
                    ordering
                } else {
@ -422,7 +428,7 @@ impl<'a> ShapedText<'a> {

        // Search for the outermost glyph with the text index.
        while let Some(next) = next(idx, 1) {
-            if self.glyphs.get(next).map_or(true, |g| g.cluster != text_index) {
+            if self.glyphs.get(next).map_or(true, |g| g.range.start != text_index) {
                break;
            }
            idx = next;
@ -444,7 +450,6 @@ impl Debug for ShapedText<'_> {
 /// Holds shaping results and metadata common to all shaped segments.
 struct ShapingContext<'a> {
    vt: &'a Vt<'a>,
-    base: usize,
    spans: &'a SpanMapper,
    glyphs: Vec<ShapedGlyph>,
    used: Vec<Font>,
@ -468,7 +473,6 @@ pub fn shape<'a>(
    let size = TextElem::size_in(styles);
    let mut ctx = ShapingContext {
        vt,
-        base,
        spans,
        size,
        glyphs: vec![],
@ -481,7 +485,7 @@ pub fn shape<'a>(
    };

    if !text.is_empty() {
-        shape_segment(&mut ctx, 0, text, families(styles));
+        shape_segment(&mut ctx, base, text, families(styles));
    }

    track_and_space(&mut ctx);
@ -552,6 +556,7 @@ fn shape_segment(
    let buffer = rustybuzz::shape(font.rusty(), &ctx.tags, buffer);
    let infos = buffer.glyph_infos();
    let pos = buffer.glyph_positions();
+    let ltr = ctx.dir.is_positive();

    // Collect the shaped glyphs, doing fallback and shaping parts again with
    // the next font if necessary.
@ -560,68 +565,66 @@ fn shape_segment(
        let info = &infos[i];
        let cluster = info.cluster as usize;

+        // Add the glyph to the shaped output.
        if info.glyph_id != 0 {
-            // Add the glyph to the shaped output.
-            // TODO: Don't ignore y_advance.
-            let (span, offset) = ctx.spans.span_at(ctx.base + cluster);
-            ctx.glyphs.push(ShapedGlyph {
-                font: font.clone(),
-                glyph_id: info.glyph_id as u16,
-                x_advance: font.to_em(pos[i].x_advance),
-                x_offset: font.to_em(pos[i].x_offset),
-                y_offset: font.to_em(pos[i].y_offset),
-                cluster: base + cluster,
-                safe_to_break: !info.unsafe_to_break(),
-                c: text[cluster..].chars().next().unwrap(),
-                span,
-                offset,
-            });
-        } else {
-            // Determine the source text range for the tofu sequence.
-            let range = {
-                // First, search for the end of the tofu sequence.
-                let k = i;
-                while infos.get(i + 1).map_or(false, |info| info.glyph_id == 0) {
-                    i += 1;
-                }
-
-                // Then, determine the start and end text index.
-                //
-                // Examples:
-                // Everything is shown in visual order. Tofus are written as "_".
-                // We want to find out that the tofus span the text `2..6`.
-                // Note that the clusters are longer than 1 char.
-                //
-                // Left-to-right:
-                // Text:     h a l i h a l l o
-                // Glyphs:   A   _   _   C   E
-                // Clusters: 0   2   4   6   8
-                //              k=1 i=2
-                //
-                // Right-to-left:
-                // Text:     O L L A H I L A H
-                // Glyphs:   E   C   _   _   A
-                // Clusters: 8   6   4   2   0
-                //                  k=2 i=3
-                let ltr = ctx.dir.is_positive();
-                let first = if ltr { k } else { i };
-                let start = infos[first].cluster as usize;
-                let last = if ltr { i.checked_add(1) } else { k.checked_sub(1) };
-                let end = last
+            // Determine the text range of the glyph.
+            let start = base + cluster;
+            let end = base
+                + if ltr { i.checked_add(1) } else { i.checked_sub(1) }
                    .and_then(|last| infos.get(last))
                    .map_or(text.len(), |info| info.cluster as usize);

-                start..end
-            };
+            ctx.glyphs.push(ShapedGlyph {
+                font: font.clone(),
+                glyph_id: info.glyph_id as u16,
+                // TODO: Don't ignore y_advance.
+                x_advance: font.to_em(pos[i].x_advance),
+                x_offset: font.to_em(pos[i].x_offset),
+                y_offset: font.to_em(pos[i].y_offset),
+                range: start..end,
+                safe_to_break: !info.unsafe_to_break(),
+                c: text[cluster..].chars().next().unwrap(),
+                span: ctx.spans.span_at(start),
+            });
+        } else {
+            // First, search for the end of the tofu sequence.
+            let k = i;
+            while infos.get(i + 1).map_or(false, |info| info.glyph_id == 0) {
+                i += 1;
+            }
+
+            // Then, determine the start and end text index for the tofu
+            // sequence.
+            //
+            // Examples:
+            // Everything is shown in visual order. Tofus are written as "_".
+            // We want to find out that the tofus span the text `2..6`.
+            // Note that the clusters are longer than 1 char.
+            //
+            // Left-to-right:
+            // Text:     h a l i h a l l o
+            // Glyphs:   A   _   _   C   E
+            // Clusters: 0   2   4   6   8
+            //              k=1 i=2
+            //
+            // Right-to-left:
+            // Text:     O L L A H I L A H
+            // Glyphs:   E   C   _   _   A
+            // Clusters: 8   6   4   2   0
+            //                  k=2 i=3
+            let start = infos[if ltr { k } else { i }].cluster as usize;
+            let end = if ltr { i.checked_add(1) } else { k.checked_sub(1) }
+                .and_then(|last| infos.get(last))
+                .map_or(text.len(), |info| info.cluster as usize);

            // Trim half-baked cluster.
-            let remove = base + range.start..base + range.end;
-            while ctx.glyphs.last().map_or(false, |g| remove.contains(&g.cluster)) {
+            let remove = base + start..base + end;
+            while ctx.glyphs.last().map_or(false, |g| remove.contains(&g.range.start)) {
                ctx.glyphs.pop();
            }

            // Recursively shape the tofu sequence with the next family.
-            shape_segment(ctx, base + range.start, &text[range], families.clone());
+            shape_segment(ctx, base + start, &text[start..end], families.clone());
        }

        i += 1;
@ -634,19 +637,18 @@ fn shape_segment(
 fn shape_tofus(ctx: &mut ShapingContext, base: usize, text: &str, font: Font) {
    let x_advance = font.advance(0).unwrap_or_default();
    for (cluster, c) in text.char_indices() {
-        let cluster = base + cluster;
-        let (span, offset) = ctx.spans.span_at(ctx.base + cluster);
+        let start = base + cluster;
+        let end = start + c.len_utf8();
        ctx.glyphs.push(ShapedGlyph {
            font: font.clone(),
            glyph_id: 0,
            x_advance,
            x_offset: Em::zero(),
            y_offset: Em::zero(),
-            cluster,
+            range: start..end,
            safe_to_break: true,
            c,
-            span,
-            offset,
+            span: ctx.spans.span_at(start),
        });
    }
 }
@ -668,7 +670,10 @@ fn track_and_space(ctx: &mut ShapingContext) {
            glyph.x_advance = spacing.relative_to(glyph.x_advance);
        }

-        if glyphs.peek().map_or(false, |next| glyph.cluster != next.cluster) {
+        if glyphs
+            .peek()
+            .map_or(false, |next| glyph.range.start != next.range.start)
+        {
            glyph.x_advance += tracking;
        }
    }
--- a/src/doc.rs
+++ b/src/doc.rs
@ -1,7 +1,8 @@
 //! Finished documents.

-use std::fmt::{self, Debug, Formatter, Write};
+use std::fmt::{self, Debug, Formatter};
 use std::num::NonZeroUsize;
+use std::ops::Range;
 use std::str::FromStr;
 use std::sync::Arc;

@ -114,23 +115,6 @@ impl Frame {
    pub fn items(&self) -> std::slice::Iter<'_, (Point, FrameItem)> {
        self.items.iter()
    }
-
-    /// Approximately recover the text inside of the frame and its children.
-    pub fn text(&self) -> EcoString {
-        let mut text = EcoString::new();
-        for (_, item) in self.items() {
-            match item {
-                FrameItem::Text(item) => {
-                    for glyph in &item.glyphs {
-                        text.push(glyph.c);
-                    }
-                }
-                FrameItem::Group(group) => text.push_str(&group.frame.text()),
-                _ => {}
-            }
-        }
-        text
-    }
 }

 /// Insert items and subframes.
@ -476,6 +460,8 @@ pub struct TextItem {
    pub fill: Paint,
    /// The natural language of the text.
    pub lang: Lang,
+    /// The item's plain text.
+    pub text: EcoString,
    /// The glyphs.
    pub glyphs: Vec<Glyph>,
 }
@ -489,19 +475,14 @@ impl TextItem {

 impl Debug for TextItem {
    fn fmt(&self, f: &mut Formatter) -> fmt::Result {
-        // This is only a rough approximation of the source text.
-        f.write_str("Text(\"")?;
-        for glyph in &self.glyphs {
-            for c in glyph.c.escape_debug() {
-                f.write_char(c)?;
-            }
-        }
-        f.write_str("\")")
+        f.write_str("Text(")?;
+        self.text.fmt(f)?;
+        f.write_str(")")
    }
 }

 /// A glyph in a run of shaped text.
-#[derive(Debug, Copy, Clone, Eq, PartialEq, Hash)]
+#[derive(Debug, Clone, Eq, PartialEq, Hash)]
 pub struct Glyph {
    /// The glyph's index in the font.
    pub id: u16,
@ -509,12 +490,17 @@ pub struct Glyph {
    pub x_advance: Em,
    /// The horizontal offset of the glyph.
    pub x_offset: Em,
-    /// The first character of the glyph's cluster.
-    pub c: char,
+    /// The range of the glyph in its item's text.
+    pub range: Range<u16>,
    /// The source code location of the text.
-    pub span: Span,
-    /// The offset within the spanned text.
-    pub offset: u16,
+    pub span: (Span, u16),
+}
+
+impl Glyph {
+    /// The range of the glyph in its item's text.
+    pub fn range(&self) -> Range<usize> {
+        usize::from(self.range.start)..usize::from(self.range.end)
+    }
 }

 /// An identifier for a natural language.
--- a/src/export/pdf/font.rs
+++ b/src/export/pdf/font.rs
@ -1,13 +1,21 @@
 use std::collections::BTreeMap;

-use ecow::eco_format;
+use ecow::{eco_format, EcoString};
 use pdf_writer::types::{CidFontType, FontFlags, SystemInfo, UnicodeCmap};
 use pdf_writer::{Filter, Finish, Name, Rect, Str};
 use ttf_parser::{name_id, GlyphId, Tag};
+use unicode_general_category::GeneralCategory;

 use super::{deflate, EmExt, PdfContext, RefExt};
 use crate::util::SliceExt;

+const CMAP_NAME: Name = Name(b"Custom");
+const SYSTEM_INFO: SystemInfo = SystemInfo {
+    registry: Str(b"Adobe"),
+    ordering: Str(b"Identity"),
+    supplement: 0,
+};
+
 /// Embed all used fonts into the PDF.
 #[tracing::instrument(skip_all)]
 pub fn write_fonts(ctx: &mut PdfContext) {
@ -19,7 +27,7 @@ pub fn write_fonts(ctx: &mut PdfContext) {
        let data_ref = ctx.alloc.bump();
        ctx.font_refs.push(type0_ref);

-        let glyphs = &ctx.glyph_sets[font];
+        let glyph_set = ctx.glyph_sets.get_mut(font).unwrap();
        let metrics = font.metrics();
        let ttf = font.ttf();

@ -29,12 +37,6 @@ pub fn write_fonts(ctx: &mut PdfContext) {

        let base_font = eco_format!("ABCDEF+{}", postscript_name);
        let base_font = Name(base_font.as_bytes());
-        let cmap_name = Name(b"Custom");
-        let system_info = SystemInfo {
-            registry: Str(b"Adobe"),
-            ordering: Str(b"Identity"),
-            supplement: 0,
-        };

        // Write the base font object referencing the CID font.
        ctx.writer
@ -59,7 +61,7 @@ pub fn write_fonts(ctx: &mut PdfContext) {
        let mut cid = ctx.writer.cid_font(cid_ref);
        cid.subtype(subtype);
        cid.base_font(base_font);
-        cid.system_info(system_info);
+        cid.system_info(SYSTEM_INFO);
        cid.font_descriptor(descriptor_ref);
        cid.default_width(0.0);

@ -70,7 +72,7 @@ pub fn write_fonts(ctx: &mut PdfContext) {
        // Extract the widths of all glyphs.
        let num_glyphs = ttf.number_of_glyphs();
        let mut widths = vec![0.0; num_glyphs as usize];
-        for &g in glyphs {
+        for &g in glyph_set.keys() {
            let x = ttf.glyph_hor_advance(GlyphId(g)).unwrap_or(0);
            widths[g as usize] = font.to_em(x).to_font_units();
        }
@ -130,42 +132,15 @@ pub fn write_fonts(ctx: &mut PdfContext) {

        font_descriptor.finish();

-        // Compute a reverse mapping from glyphs to unicode.
-        let cmap = {
-            let mut mapping = BTreeMap::new();
-            for subtable in
-                ttf.tables().cmap.into_iter().flat_map(|table| table.subtables)
-            {
-                if subtable.is_unicode() {
-                    subtable.codepoints(|n| {
-                        if let Some(c) = std::char::from_u32(n) {
-                            if let Some(GlyphId(g)) = ttf.glyph_index(c) {
-                                if glyphs.contains(&g) {
-                                    mapping.insert(g, c);
-                                }
-                            }
-                        }
-                    });
-                }
-            }
-
-            let mut cmap = UnicodeCmap::new(cmap_name, system_info);
-            for (g, c) in mapping {
-                cmap.pair(g, c);
-            }
-            cmap
-        };
-
        // Write the /ToUnicode character map, which maps glyph ids back to
        // unicode codepoints to enable copying out of the PDF.
-        ctx.writer
-            .cmap(cmap_ref, &deflate(&cmap.finish()))
-            .filter(Filter::FlateDecode);
+        let cmap = create_cmap(ttf, glyph_set);
+        ctx.writer.cmap(cmap_ref, &cmap.finish());

        // Subset and write the font's bytes.
        let data = font.data();
        let subsetted = {
-            let glyphs: Vec<_> = glyphs.iter().copied().collect();
+            let glyphs: Vec<_> = glyph_set.keys().copied().collect();
            let profile = subsetter::Profile::pdf(&glyphs);
            subsetter::subset(data, font.index(), profile)
        };
@ -183,3 +158,44 @@ pub fn write_fonts(ctx: &mut PdfContext) {
        stream.finish();
    }
 }
+
+/// Create a /ToUnicode CMap.
+fn create_cmap(
+    ttf: &ttf_parser::Face,
+    glyph_set: &mut BTreeMap<u16, EcoString>,
+) -> UnicodeCmap {
+    // For glyphs that have codepoints mapping to them in the font's cmap
+    // table, we prefer those codepoints over pre-existing text mappings from
+    // the document. Only things that don't have a corresponding codepoint (or
+    // only a private-use one), like the "Th" in Linux Libertine, get the text
+    // of their first occurrence in the document instead.
+    for subtable in ttf.tables().cmap.into_iter().flat_map(|table| table.subtables) {
+        if !subtable.is_unicode() {
+            continue;
+        }
+
+        subtable.codepoints(|n| {
+            let Some(c) = std::char::from_u32(n) else { return };
+            if unicode_general_category::get_general_category(c)
+                == GeneralCategory::PrivateUse
+            {
+                return;
+            }
+
+            let Some(GlyphId(g)) = ttf.glyph_index(c) else { return };
+            if glyph_set.contains_key(&g) {
+                glyph_set.insert(g, c.into());
+            }
+        });
+    }
+
+    // Produce a reverse mapping from glyphs to unicode strings.
+    let mut cmap = UnicodeCmap::new(CMAP_NAME, SYSTEM_INFO);
+    for (&g, text) in glyph_set.iter() {
+        if !text.is_empty() {
+            cmap.pair_with_multiple(g, text.chars());
+        }
+    }
+
+    cmap
+}
--- a/src/export/pdf/mod.rs
+++ b/src/export/pdf/mod.rs
@ -6,9 +6,10 @@ mod outline;
 mod page;

 use std::cmp::Eq;
-use std::collections::{HashMap, HashSet};
+use std::collections::{BTreeMap, HashMap};
 use std::hash::Hash;

+use ecow::EcoString;
 use pdf_writer::types::Direction;
 use pdf_writer::{Finish, Name, PdfWriter, Ref, TextStr};
 use xmp_writer::{LangId, RenditionClass, XmpWriter};
@ -52,7 +53,13 @@ pub struct PdfContext<'a> {
    page_refs: Vec<Ref>,
    font_map: Remapper<Font>,
    image_map: Remapper<Image>,
-    glyph_sets: HashMap<Font, HashSet<u16>>,
+    /// For each font, a mapping from used glyphs to their text representation.
+    /// May contain multiple chars in case of ligatures or similar things. The
+    /// same glyph can have different text representations within one document;
+    /// in that case we just save the first one. The resulting strings are used
+    /// for the PDF's /ToUnicode map for glyphs that don't have an entry in the
+    /// font's cmap. This is important for copy-paste and searching.
+    glyph_sets: HashMap<Font, BTreeMap<u16, EcoString>>,
    languages: HashMap<Lang, usize>,
 }

--- a/src/export/pdf/page.rs
+++ b/src/export/pdf/page.rs
@ -364,11 +364,12 @@ fn write_group(ctx: &mut PageContext, pos: Point, group: &GroupItem) {
 /// Encode a text run into the content stream.
 fn write_text(ctx: &mut PageContext, x: f32, y: f32, text: &TextItem) {
    *ctx.parent.languages.entry(text.lang).or_insert(0) += text.glyphs.len();
-    ctx.parent
-        .glyph_sets
-        .entry(text.font.clone())
-        .or_default()
-        .extend(text.glyphs.iter().map(|g| g.id));
+
+    let glyph_set = ctx.parent.glyph_sets.entry(text.font.clone()).or_default();
+    for g in &text.glyphs {
+        let segment = &text.text[g.range()];
+        glyph_set.entry(g.id).or_insert_with(|| segment.into());
+    }

    ctx.set_fill(&text.fill);
    ctx.set_font(&text.font, text.size);
--- a/src/ide/jump.rs
+++ b/src/ide/jump.rs
@ -67,7 +67,8 @@ pub fn jump_from_click(

            FrameItem::Text(text) => {
                for glyph in &text.glyphs {
-                    if glyph.span.is_detached() {
+                    let (span, span_offset) = glyph.span;
+                    if span.is_detached() {
                        continue;
                    }

@ -77,13 +78,13 @@ pub fn jump_from_click(
                        Size::new(width, text.size),
                        click,
                    ) {
-                        let source = world.source(glyph.span.source());
-                        let node = source.find(glyph.span)?;
+                        let source = world.source(span.source());
+                        let node = source.find(span)?;
                        let pos = if node.kind() == SyntaxKind::Text {
                            let range = node.range();
-                            let mut offset = range.start + usize::from(glyph.offset);
+                            let mut offset = range.start + usize::from(span_offset);
                            if (click.x - pos.x) > width / 2.0 {
-                                offset += glyph.c.len_utf8();
+                                offset += glyph.range().len();
                            }
                            offset.min(range.end)
                        } else {
@ -150,7 +151,7 @@ fn find_in_frame(frame: &Frame, span: Span) -> Option<Point> {

        if let FrameItem::Text(text) = item {
            for glyph in &text.glyphs {
-                if glyph.span == span {
+                if glyph.span.0 == span {
                    return Some(pos);
                }
                pos.x += glyph.x_advance.at(text.size);
--- a/tests/ref/text/copy-paste.png
+++ b/tests/ref/text/copy-paste.png
--- a/tests/ref/text/shaping.png
+++ b/tests/ref/text/shaping.png
--- a/tests/src/tests.rs
+++ b/tests/src/tests.rs
@ -353,9 +353,18 @@ fn test(
    pdf_path: Option<&Path>,
    args: &Args,
 ) -> bool {
-    let name = src_path.strip_prefix(TYP_DIR).unwrap_or(src_path);
+    struct PanicGuard<'a>(&'a Path);
+    impl Drop for PanicGuard<'_> {
+        fn drop(&mut self) {
+            if std::thread::panicking() {
+                println!("Panicked in {}", self.0.display());
+            }
+        }
+    }

+    let name = src_path.strip_prefix(TYP_DIR).unwrap_or(src_path);
    let text = fs::read_to_string(src_path).unwrap();
+    let _guard = PanicGuard(name);

    let mut output = String::new();
    let mut ok = true;
@ -401,6 +410,7 @@ fn test(
                line,
                &mut rng,
            );
+
            ok &= part_ok;
            compare_ever |= compare_here;
            frames.extend(part_frames);
--- a/tests/typ/text/copy-paste.typ
+++ b/tests/typ/text/copy-paste.typ
@ -0,0 +1,8 @@
+// Test copy-paste and search in PDF with ligatures
+// and Arabic text. Must be tested manually!
+
+---
+The after fira 🏳️‍🌈!
+
+#set text(lang: "ar", font: "Noto Sans Arabic")
+مرحبًا