Implement CSV file header rows support. (#2619)

Co-authored-by: Laurenz <laurmaedje@gmail.com>
2023-12-18 17:14:21 +01:00 · 2023-12-18 17:14:21 +01:00 · 1f983ced90
commit 1f983ced90
parent e8e797c18b
2 changed files with 96 additions and 12 deletions
--- a/crates/typst/src/loading/csv.rs
+++ b/crates/typst/src/loading/csv.rs
@ -2,7 +2,7 @@ use ecow::{eco_format, EcoString};

 use crate::diag::{bail, At, SourceResult};
 use crate::engine::Engine;
-use crate::foundations::{cast, func, scope, Array, IntoValue, Value};
+use crate::foundations::{cast, func, scope, Array, Dict, IntoValue, Type, Value};
 use crate::loading::Readable;
 use crate::syntax::Spanned;
 use crate::World;
@ -35,11 +35,21 @@ pub fn csv(
    #[named]
    #[default]
    delimiter: Delimiter,
+    /// How to represent the file's rows.
+    ///
+    /// - If set to `array`, each row is represented as a plain array of
+    ///   strings.
+    /// - If set to `dictionary`, each row is represented as a dictionary
+    ///   mapping from header keys to strings. This option only makes sense when
+    ///   a header row is present in the CSV file.
+    #[named]
+    #[default(RowType::Array)]
+    row_type: RowType,
 ) -> SourceResult<Array> {
    let Spanned { v: path, span } = path;
    let id = span.resolve_path(&path).at(span)?;
    let data = engine.world.file(id).at(span)?;
-    self::csv::decode(Spanned::new(Readable::Bytes(data), span), delimiter)
+    self::csv::decode(Spanned::new(Readable::Bytes(data), span), delimiter, row_type)
 }

 #[scope]
@ -54,22 +64,59 @@ impl csv {
        #[named]
        #[default]
        delimiter: Delimiter,
+        /// How to represent the file's rows.
+        ///
+        /// - If set to `array`, each row is represented as a plain array of
+        ///   strings.
+        /// - If set to `dictionary`, each row is represented as a dictionary
+        ///   mapping from header keys to strings. This option only makes sense
+        ///   when a header row is present in the CSV file.
+        #[named]
+        #[default(RowType::Array)]
+        row_type: RowType,
    ) -> SourceResult<Array> {
        let Spanned { v: data, span } = data;
-        let mut builder = ::csv::ReaderBuilder::new();
-        builder.has_headers(false);
-        builder.delimiter(delimiter.0 as u8);
-        let mut reader = builder.from_reader(data.as_slice());
-        let mut array = Array::new();
+        let has_headers = row_type == RowType::Dict;

+        let mut builder = ::csv::ReaderBuilder::new();
+        builder.has_headers(has_headers);
+        builder.delimiter(delimiter.0 as u8);
+
+        // Line number of the first record; 1 when there is no header row.
+        let mut line_offset: usize = 1;
+        let mut reader = builder.from_reader(data.as_slice());
+        let mut headers: Option<::csv::StringRecord> = None;
+
+        if has_headers {
+            // Records start on line 2 because line 1 is the header row.
+            line_offset += 1;
+            headers = Some(
+                reader
+                    .headers()
+                    .map_err(|err| format_csv_error(err, 1))
+                    .at(span)?
+                    .clone(),
+            );
+        }
+
+        let mut array = Array::new();
        for (line, result) in reader.records().enumerate() {
-            // Original solution use line from error, but that is incorrect with
-            // `has_headers` set to `false`. See issue:
+            // The original solution used the line number from the error, but
+            // that is incorrect with `has_headers` set to `false`. See issue:
            // https://github.com/BurntSushi/rust-csv/issues/184
-            let line = line + 1; // Counting lines from 1
+            let line = line + line_offset;
            let row = result.map_err(|err| format_csv_error(err, line)).at(span)?;
-            let sub = row.into_iter().map(|field| field.into_value()).collect();
-            array.push(Value::Array(sub))
+            let item = if let Some(headers) = &headers {
+                let mut dict = Dict::new();
+                for (field, value) in headers.iter().zip(&row) {
+                    dict.insert(field.into(), value.into_value());
+                }
+                dict.into_value()
+            } else {
+                let sub = row.into_iter().map(|field| field.into_value()).collect();
+                Value::Array(sub)
+            };
+            array.push(item);
        }

        Ok(array)
@ -103,6 +150,30 @@ cast! {
    },
 }

+/// How each parsed CSV row is represented.
+#[derive(Debug, Copy, Clone, Eq, PartialEq, Hash)]
+pub enum RowType {
+    Array,
+    Dict,
+}
+
+cast! {
+    RowType,
+    self => match self {
+        Self::Array => Type::of::<Array>(),
+        Self::Dict => Type::of::<Dict>(),
+    }.into_value(),
+    ty: Type => {
+        if ty == Type::of::<Array>() {
+            Self::Array
+        } else if ty == Type::of::<Dict>() {
+            Self::Dict
+        } else {
+            bail!("expected `array` or `dictionary`");
+        }
+    },
+}
+
 /// Format the user-facing CSV error message.
 fn format_csv_error(err: ::csv::Error, line: usize) -> EcoString {
    match err.kind() {
--- a/tests/typ/compute/data.typ
+++ b/tests/typ/compute/data.typ
@ -22,6 +22,14 @@
 #let cells = data.at(0).map(strong) + data.slice(1).flatten()
 #table(columns: data.at(0).len(), ..cells)

+---
+// Test reading CSV data with dictionary rows enabled.
+#let data = csv("/files/zoo.csv", row-type: dictionary)
+#test(data.len(), 3)
+#test(data.at(0).Name, "Debby")
+#test(data.at(2).Weight, "150kg")
+#test(data.at(1).Species, "Tiger")
+
 ---
 // Error: 6-16 file not found (searched at typ/compute/nope.csv)
 #csv("nope.csv")
@ -30,6 +38,11 @@
 // Error: 6-22 failed to parse CSV (found 3 instead of 2 fields in line 3)
 #csv("/files/bad.csv")

+---
+// Test error numbering with dictionary rows.
+// Error: 6-22 failed to parse CSV (found 3 instead of 2 fields in line 3)
+#csv("/files/bad.csv", row-type: dictionary)
+
 ---
 // Test reading JSON data.
 #let data = json("/files/zoo.json")