Compress images in PDFs ⚙

2020-11-30 22:07:08 +01:00 · 2020-11-30 22:07:08 +01:00 · fdc1b378a3
commit fdc1b378a3
parent 21857064db
14 changed files with 158 additions and 62 deletions
--- a/Cargo.toml
+++ b/Cargo.toml
@ -14,6 +14,7 @@ fs = ["fontdock/fs"]
 [dependencies]
 fontdock = { path = "../fontdock", default-features = false }
 pdf-writer = { path = "../pdf-writer" }
+deflate = { version = "0.8.6" }
 image = { version = "0.23", default-features = false, features = ["jpeg", "png"] }
 itoa = "0.4"
 ttf-parser = "0.8.2"
--- a/src/env.rs
+++ b/src/env.rs
@ -5,9 +5,13 @@ use std::cell::RefCell;
 use std::collections::{hash_map::Entry, HashMap};
 use std::fmt::{self, Debug, Formatter};
 use std::fs;
+use std::io::Cursor;
 use std::path::{Path, PathBuf};
 use std::rc::Rc;

+use image::io::Reader as ImageReader;
+use image::{DynamicImage, GenericImageView, ImageFormat};
+
 use crate::font::FontLoader;

 /// A reference-counted shared environment.
@ -48,11 +52,11 @@ impl ResourceLoader {
        let id = match self.paths.entry(path.to_owned()) {
            Entry::Occupied(entry) => *entry.get(),
            Entry::Vacant(entry) => {
-                let id = *entry.insert(ResourceId(self.entries.len()));
                let data = fs::read(path).ok()?;
                let resource = parse(data)?;
+                let len = self.entries.len();
                self.entries.push(Box::new(resource));
-                id
+                *entry.insert(ResourceId(len))
            }
        };

@ -63,6 +67,7 @@ impl ResourceLoader {
    ///
    /// # Panics
    /// This panics if no resource with this id was loaded.
+    #[track_caller]
    pub fn get_loaded<R: 'static>(&self, id: ResourceId) -> &R {
        self.entries[id.0].downcast_ref().expect("bad resource type")
    }
@ -73,3 +78,32 @@ impl Debug for ResourceLoader {
        f.debug_set().entries(self.paths.keys()).finish()
    }
 }
+
+/// A loaded image resource.
+pub struct ImageResource {
+    /// The original format the image was encoded in.
+    pub format: ImageFormat,
+    /// The decoded image.
+    pub buf: DynamicImage,
+}
+
+impl ImageResource {
+    pub fn parse(data: Vec<u8>) -> Option<Self> {
+        let reader = ImageReader::new(Cursor::new(data)).with_guessed_format().ok()?;
+        let format = reader.format()?;
+        let buf = reader.decode().ok()?;
+        Some(Self { format, buf })
+    }
+}
+
+impl Debug for ImageResource {
+    fn fmt(&self, f: &mut Formatter) -> fmt::Result {
+        let (width, height) = self.buf.dimensions();
+        f.debug_struct("ImageResource")
+            .field("format", &self.format)
+            .field("color", &self.buf.color())
+            .field("width", &width)
+            .field("height", &height)
+            .finish()
+    }
+}
--- a/src/eval/mod.rs
+++ b/src/eval/mod.rs
@ -37,7 +37,7 @@ pub fn eval(tree: &SynTree, env: SharedEnv, state: State) -> Pass<Document> {
    let mut ctx = EvalContext::new(env, state);
    ctx.start_page_group(false);
    tree.eval(&mut ctx);
-    ctx.end_page_group();
+    ctx.end_page_group(true);
    ctx.finish()
 }

@ -117,7 +117,8 @@ impl EvalContext {

    /// Start a page group based on the active page state.
    ///
-    /// If `hard` is false, empty page runs will be omitted from the output.
+    /// If both this `hard` and the one in the matching call to `end_page_group`
+    /// are false, empty page runs will be omitted from the output.
    ///
    /// This also starts an inner paragraph.
    pub fn start_page_group(&mut self, hard: bool) {
@ -134,10 +135,10 @@ impl EvalContext {
    /// End a page group and push it to the finished page runs.
    ///
    /// This also ends an inner paragraph.
-    pub fn end_page_group(&mut self) {
+    pub fn end_page_group(&mut self, hard: bool) {
        self.end_par_group();
        let (group, children) = self.end_group::<PageGroup>();
-        if group.hard || !children.is_empty() {
+        if hard || group.hard || !children.is_empty() {
            self.runs.push(Pages {
                size: group.size,
                child: LayoutNode::dynamic(Pad {
@ -208,6 +209,7 @@ impl EvalContext {
    /// End a layouting group started with [`start_group`](Self::start_group).
    ///
    /// This returns the stored metadata and the collected nodes.
+    #[track_caller]
    fn end_group<T: 'static>(&mut self) -> (T, Vec<LayoutNode>) {
        if let Some(&LayoutNode::Spacing(spacing)) = self.inner.last() {
            if spacing.softness == Softness::Soft {
--- a/src/export/pdf.rs
+++ b/src/export/pdf.rs
@ -3,16 +3,19 @@
 use std::cmp::Eq;
 use std::collections::HashMap;
 use std::hash::Hash;
+use std::io::Write;

+use deflate::write::ZlibEncoder;
+use deflate::Compression;
 use fontdock::FaceId;
-use image::{DynamicImage, GenericImageView, Rgba};
+use image::{DynamicImage, GenericImageView, ImageFormat, ImageResult, Luma, Rgba};
 use pdf_writer::{
-    CidFontType, ColorSpace, Content, FontFlags, Name, PdfWriter, Rect, Ref, Str,
+    CidFontType, ColorSpace, Content, Filter, FontFlags, Name, PdfWriter, Rect, Ref, Str,
    SystemInfo, UnicodeCmap,
 };
 use ttf_parser::{name_id, GlyphId};

-use crate::env::{Env, ResourceId};
+use crate::env::{Env, ImageResource, ResourceId};
 use crate::geom::Length;
 use crate::layout::{BoxLayout, LayoutElement};

@ -50,8 +53,8 @@ impl<'a> PdfExporter<'a> {
                match element {
                    LayoutElement::Text(shaped) => fonts.insert(shaped.face),
                    LayoutElement::Image(image) => {
-                        let buf = env.resources.get_loaded::<DynamicImage>(image.res);
-                        if buf.color().has_alpha() {
+                        let img = env.resources.get_loaded::<ImageResource>(image.res);
+                        if img.buf.color().has_alpha() {
                            alpha_masks += 1;
                        }
                        images.insert(image.res);
@ -266,7 +269,7 @@ impl<'a> PdfExporter<'a> {
            // Write the to-unicode character map, which maps glyph ids back to
            // unicode codepoints to enable copying out of the PDF.
            self.writer
-                .cmap_stream(refs.cmap, &{
+                .cmap(refs.cmap, &{
                    let mut cmap = UnicodeCmap::new(cmap_name, system_info);
                    for subtable in face.character_mapping_subtables() {
                        subtable.codepoints(|n| {
@ -288,39 +291,49 @@ impl<'a> PdfExporter<'a> {
    }

    fn write_images(&mut self) {
-        let mut mask = 0;
+        let mut masks_seen = 0;

        for (id, resource) in self.refs.images().zip(self.images.layout_indices()) {
-            let buf = self.env.resources.get_loaded::<DynamicImage>(resource);
-            let data = buf.to_rgb8().into_raw();
+            let img = self.env.resources.get_loaded::<ImageResource>(resource);
+            let (width, height) = img.buf.dimensions();

-            let mut image = self.writer.image_stream(id, &data);
-            image.width(buf.width() as i32);
-            image.height(buf.height() as i32);
-            image.color_space(ColorSpace::DeviceRGB);
-            image.bits_per_component(8);
+            // Add the primary image.
+            if let Ok((data, filter, color_space)) = encode_image(img) {
+                let mut image = self.writer.image(id, &data);
+                image.inner().filter(filter);
+                image.width(width as i32);
+                image.height(height as i32);
+                image.color_space(color_space);
+                image.bits_per_component(8);

-            // Add a second gray-scale image containing the alpha values if this
-            // is image has an alpha channel.
-            if buf.color().has_alpha() {
-                let mask_id = self.refs.alpha_mask(mask);
+                // Add a second gray-scale image containing the alpha values if
+                // this image has an alpha channel.
+                if img.buf.color().has_alpha() {
+                    if let Ok((alpha_data, alpha_filter)) = encode_alpha(img) {
+                        let mask_id = self.refs.alpha_mask(masks_seen);
+                        image.s_mask(mask_id);
+                        drop(image);

-                image.s_mask(mask_id);
-                drop(image);
+                        let mut mask = self.writer.image(mask_id, &alpha_data);
+                        mask.inner().filter(alpha_filter);
+                        mask.width(width as i32);
+                        mask.height(height as i32);
+                        mask.color_space(ColorSpace::DeviceGray);
+                        mask.bits_per_component(8);
+                    } else {
+                        // TODO: Warn that alpha channel could not be encoded.
+                    }

-                let mut samples = vec![];
-                for (_, _, Rgba([_, _, _, a])) in buf.pixels() {
-                    samples.push(a);
+                    masks_seen += 1;
                }
-
+            } else {
+                // TODO: Warn that image could not be encoded.
                self.writer
-                    .image_stream(mask_id, &samples)
-                    .width(buf.width() as i32)
-                    .height(buf.height() as i32)
+                    .image(id, &[])
+                    .width(0)
+                    .height(0)
                    .color_space(ColorSpace::DeviceGray)
-                    .bits_per_component(8);
-
-                mask += 1;
+                    .bits_per_component(1);
            }
        }
    }
@ -446,3 +459,57 @@ where
        self.to_layout.iter().copied()
    }
 }
+
+/// Encode an image with a suitable filter.
+///
+/// Skips the alpha channel as that's encoded separately.
+fn encode_image(img: &ImageResource) -> ImageResult<(Vec<u8>, Filter, ColorSpace)> {
+    let mut data = vec![];
+    let (filter, space) = match (img.format, &img.buf) {
+        // 8-bit gray JPEG.
+        (ImageFormat::Jpeg, DynamicImage::ImageLuma8(_)) => {
+            img.buf.write_to(&mut data, img.format)?;
+            (Filter::DctDecode, ColorSpace::DeviceGray)
+        }
+
+        // 8-bit Rgb JPEG (Cmyk JPEGs get converted to Rgb earlier).
+        (ImageFormat::Jpeg, DynamicImage::ImageRgb8(_)) => {
+            img.buf.write_to(&mut data, img.format)?;
+            (Filter::DctDecode, ColorSpace::DeviceRgb)
+        }
+
+        // TODO: Encode flate streams with PNG-predictor?
+
+        // 8-bit gray PNG.
+        (ImageFormat::Png, DynamicImage::ImageLuma8(luma)) => {
+            let mut enc = ZlibEncoder::new(&mut data, Compression::default());
+            for &Luma([value]) in luma.pixels() {
+                enc.write_all(&[value])?;
+            }
+            enc.finish()?;
+            (Filter::FlateDecode, ColorSpace::DeviceGray)
+        }
+
+        // Anything else (including Rgb(a) PNGs).
+        (_, buf) => {
+            let mut enc = ZlibEncoder::new(&mut data, Compression::default());
+            for (_, _, Rgba([r, g, b, _])) in buf.pixels() {
+                enc.write_all(&[r, g, b])?;
+            }
+            enc.finish()?;
+            (Filter::FlateDecode, ColorSpace::DeviceRgb)
+        }
+    };
+    Ok((data, filter, space))
+}
+
+/// Encode the alpha value of every pixel as a zlib-compressed 8-bit stream.
+fn encode_alpha(img: &ImageResource) -> ImageResult<(Vec<u8>, Filter)> {
+    let mut data = vec![];
+    let mut enc = ZlibEncoder::new(&mut data, Compression::default());
+    for (_, _, Rgba([_, _, _, a])) in img.buf.pixels() {
+        enc.write_all(&[a])?;
+    }
+    enc.finish()?;
+    Ok((data, Filter::FlateDecode))
+}
--- a/src/library/insert.rs
+++ b/src/library/insert.rs
@ -1,9 +1,6 @@
-use std::io::Cursor;
-
-use image::io::Reader;
 use image::GenericImageView;

-use crate::env::ResourceId;
+use crate::env::{ImageResource, ResourceId};
 use crate::layout::*;
 use crate::prelude::*;

@ -20,15 +17,10 @@ pub fn image(mut args: Args, ctx: &mut EvalContext) -> Value {

    if let Some(path) = path {
        let mut env = ctx.env.borrow_mut();
-        let loaded = env.resources.load(path.v, |data| {
-            Reader::new(Cursor::new(data))
-                .with_guessed_format()
-                .ok()
-                .and_then(|reader| reader.decode().ok())
-        });
+        let loaded = env.resources.load(path.v, ImageResource::parse);

-        if let Some((res, buf)) = loaded {
-            let dimensions = buf.dimensions();
+        if let Some((res, img)) = loaded {
+            let dimensions = img.buf.dimensions();
            drop(env);
            ctx.push(Image {
                res,
--- a/src/library/layout.rs
+++ b/src/library/layout.rs
@ -316,13 +316,13 @@ pub fn page(mut args: Args, ctx: &mut EvalContext) -> Value {
    args.done(ctx);

    if let Some(body) = body {
-        ctx.end_page_group();
+        ctx.end_page_group(false);
        ctx.start_page_group(true);
        body.eval(ctx);
        ctx.state = snapshot;
    }

-    ctx.end_page_group();
+    ctx.end_page_group(false);
    ctx.start_page_group(false);

    Value::None
@ -331,7 +331,7 @@ pub fn page(mut args: Args, ctx: &mut EvalContext) -> Value {
 /// `pagebreak`: Start a new page.
 pub fn pagebreak(args: Args, ctx: &mut EvalContext) -> Value {
    args.done(ctx);
-    ctx.end_page_group();
+    ctx.end_page_group(false);
    ctx.start_page_group(true);
    Value::None
 }
--- a/tests/README.md
+++ b/tests/README.md
@ -3,6 +3,6 @@
 - `typ`: Input files
 - `pdf`: PDF files produced by tests
 - `png`: PNG files produced by tests
-- `ref`: Reference images which the PNGs are compared to byte-wise to determine
+- `cmp`: Reference images which the PNGs are compared to byte-wise to determine
         whether the test passed or failed
 - `res`: Resource files used by tests
--- a/tests/cmp/coma.png
+++ b/tests/cmp/coma.png
--- a/tests/cmp/image.png
+++ b/tests/cmp/image.png
--- a/tests/ref/image.png
+++ b/tests/ref/image.png
--- a/tests/res/rhino.png
+++ b/tests/res/rhino.png
--- a/tests/res/tiger-alpha.png
+++ b/tests/res/tiger-alpha.png
--- a/tests/typ/image.typ
+++ b/tests/typ/image.typ
@ -6,8 +6,8 @@

 # Tiger
 [image: "res/tiger.jpg", width=2cm]
-[image: "res/tiger-alpha.png", width=1cm]
-[image: "res/tiger-alpha.png", height=2cm]
+[image: "res/rhino.png", width=1cm]
+[image: "res/rhino.png", height=2cm]

 [pagebreak]

--- a/tests/typeset.rs
+++ b/tests/typeset.rs
@ -6,7 +6,7 @@ use std::path::Path;
 use std::rc::Rc;

 use fontdock::fs::{FsIndex, FsSource};
-use image::{DynamicImage, GenericImageView, Rgba};
+use image::{GenericImageView, Rgba};
 use memmap::Mmap;
 use tiny_skia::{
    Canvas, Color, ColorU8, FillRule, FilterQuality, Paint, PathBuilder, Pattern, Pixmap,
@ -15,7 +15,7 @@ use tiny_skia::{
 use ttf_parser::OutlineBuilder;

 use typst::diag::{Feedback, Pass};
-use typst::env::{Env, ResourceLoader, SharedEnv};
+use typst::env::{Env, ImageResource, ResourceLoader, SharedEnv};
 use typst::eval::State;
 use typst::export::pdf;
 use typst::font::FontLoader;
@ -29,7 +29,7 @@ const FONT_DIR: &str = "../fonts";
 const TYP_DIR: &str = "typ";
 const PDF_DIR: &str = "pdf";
 const PNG_DIR: &str = "png";
-const REF_DIR: &str = "ref";
+const CMP_DIR: &str = "cmp";

 fn main() {
    env::set_current_dir(env::current_dir().unwrap().join("tests")).unwrap();
@ -46,7 +46,7 @@ fn main() {
        let name = src_path.file_stem().unwrap().to_string_lossy().to_string();
        let pdf_path = Path::new(PDF_DIR).join(&name).with_extension("pdf");
        let png_path = Path::new(PNG_DIR).join(&name).with_extension("png");
-        let ref_path = Path::new(REF_DIR).join(&name).with_extension("png");
+        let ref_path = Path::new(CMP_DIR).join(&name).with_extension("png");

        if filter.matches(&name) {
            filtered.push((name, src_path, pdf_path, png_path, ref_path));
@ -247,8 +247,8 @@ fn draw_text(canvas: &mut Canvas, pos: Point, env: &Env, shaped: &Shaped) {
    }
 }

-fn draw_image(canvas: &mut Canvas, pos: Point, env: &Env, image: &ImageElement) {
-    let buf = env.resources.get_loaded::<DynamicImage>(image.res);
+fn draw_image(canvas: &mut Canvas, pos: Point, env: &Env, img: &ImageElement) {
+    let buf = &env.resources.get_loaded::<ImageResource>(img.res).buf;

    let mut pixmap = Pixmap::new(buf.width(), buf.height()).unwrap();
    for ((_, _, src), dest) in buf.pixels().zip(pixmap.pixels_mut()) {
@ -256,8 +256,8 @@ fn draw_image(canvas: &mut Canvas, pos: Point, env: &Env, image: &ImageElement)
        *dest = ColorU8::from_rgba(r, g, b, a).premultiply();
    }

-    let view_width = image.size.width.to_pt() as f32;
-    let view_height = image.size.height.to_pt() as f32;
+    let view_width = img.size.width.to_pt() as f32;
+    let view_height = img.size.height.to_pt() as f32;

    let x = pos.x.to_pt() as f32;
    let y = pos.y.to_pt() as f32;