sheetkit_core/workbook/
mod.rs

1//! Workbook file I/O: reading and writing `.xlsx` files.
2//!
3//! An `.xlsx` file is a ZIP archive containing XML parts. This module provides
4//! [`Workbook`] which holds the parsed XML structures in memory and can
5//! serialize them back to a valid `.xlsx` file.
6
7use std::collections::{HashMap, HashSet};
8use std::io::{Read as _, Write as _};
9use std::path::Path;
10use std::sync::OnceLock;
11
12use serde::Serialize;
13use sheetkit_xml::chart::ChartSpace;
14use sheetkit_xml::comments::Comments;
15use sheetkit_xml::content_types::{
16    mime_types, ContentTypeDefault, ContentTypeOverride, ContentTypes,
17};
18
/// The OOXML package format of a workbook.
///
/// The format is determined by the workbook content type in
/// `[Content_Types].xml` and controls which content type string is emitted
/// for `xl/workbook.xml` on save. Each variant maps to exactly one
/// `mime_types` constant (see [`WorkbookFormat::content_type`]) and to one
/// file extension (see [`WorkbookFormat::from_extension`]).
///
/// [`WorkbookFormat::Xlsx`] is the `Default` variant.
#[derive(Debug, Default, Clone, Copy, PartialEq, Eq)]
pub enum WorkbookFormat {
    /// Standard spreadsheet (`.xlsx`). This is the default format.
    #[default]
    Xlsx,
    /// Macro-enabled spreadsheet (`.xlsm`).
    Xlsm,
    /// Template (`.xltx`).
    Xltx,
    /// Macro-enabled template (`.xltm`).
    Xltm,
    /// Macro-enabled add-in (`.xlam`).
    Xlam,
}
36
37impl WorkbookFormat {
38    /// Infer the format from a workbook content type string found in
39    /// `[Content_Types].xml`.
40    pub fn from_content_type(ct: &str) -> Option<Self> {
41        match ct {
42            mime_types::WORKBOOK => Some(Self::Xlsx),
43            mime_types::WORKBOOK_MACRO => Some(Self::Xlsm),
44            mime_types::WORKBOOK_TEMPLATE => Some(Self::Xltx),
45            mime_types::WORKBOOK_TEMPLATE_MACRO => Some(Self::Xltm),
46            mime_types::WORKBOOK_ADDIN_MACRO => Some(Self::Xlam),
47            _ => None,
48        }
49    }
50
51    /// Infer the format from a file extension (case-insensitive, without the
52    /// leading dot). Returns `None` for unrecognized extensions.
53    pub fn from_extension(ext: &str) -> Option<Self> {
54        match ext.to_ascii_lowercase().as_str() {
55            "xlsx" => Some(Self::Xlsx),
56            "xlsm" => Some(Self::Xlsm),
57            "xltx" => Some(Self::Xltx),
58            "xltm" => Some(Self::Xltm),
59            "xlam" => Some(Self::Xlam),
60            _ => None,
61        }
62    }
63
64    /// Return the OOXML content type string for this format.
65    pub fn content_type(self) -> &'static str {
66        match self {
67            Self::Xlsx => mime_types::WORKBOOK,
68            Self::Xlsm => mime_types::WORKBOOK_MACRO,
69            Self::Xltx => mime_types::WORKBOOK_TEMPLATE,
70            Self::Xltm => mime_types::WORKBOOK_TEMPLATE_MACRO,
71            Self::Xlam => mime_types::WORKBOOK_ADDIN_MACRO,
72        }
73    }
74}
75
76use sheetkit_xml::drawing::{MarkerType, WsDr};
77use sheetkit_xml::relationships::{self, rel_types, Relationship, Relationships};
78use sheetkit_xml::shared_strings::Sst;
79use sheetkit_xml::styles::StyleSheet;
80use sheetkit_xml::workbook::{WorkbookProtection, WorkbookXml};
81use sheetkit_xml::worksheet::{Cell, CellFormula, CellTypeTag, DrawingRef, Row, WorksheetXml};
82use zip::write::SimpleFileOptions;
83use zip::CompressionMethod;
84
85use crate::cell::CellValue;
86use crate::cell_ref_shift::shift_cell_references_in_text;
87use crate::chart::ChartConfig;
88use crate::comment::CommentConfig;
89use crate::conditional::ConditionalFormatRule;
90use crate::error::{Error, Result};
91use crate::image::ImageConfig;
92use crate::pivot::{PivotTableConfig, PivotTableInfo};
93use crate::protection::WorkbookProtectionConfig;
94use crate::sst::SharedStringTable;
95use crate::threaded_comment::{PersonData, PersonInput, ThreadedCommentData, ThreadedCommentInput};
96use crate::utils::cell_ref::{cell_name_to_coordinates, column_name_to_number};
97use crate::utils::constants::MAX_CELL_CHARS;
98use crate::validation::DataValidationConfig;
99use crate::workbook_paths::{
100    default_relationships, relationship_part_path, relative_relationship_target,
101    resolve_relationship_target,
102};
103
104#[path = "aux_parts.rs"]
105pub(crate) mod aux;
106mod cell_ops;
107mod data;
108mod drawing;
109mod features;
110mod io;
111mod open_options;
112mod sheet_ops;
113mod source;
114
115pub use open_options::{AuxParts, OpenOptions, ReadMode};
116pub(crate) use source::PackageSource;
117
118/// Helper to initialize an `OnceLock<WorksheetXml>` with a value at
119/// construction time. Avoids repeating the `set`+`unwrap` pattern.
120pub(crate) fn initialized_lock(ws: WorksheetXml) -> OnceLock<WorksheetXml> {
121    let lock = OnceLock::new();
122    let _ = lock.set(ws);
123    lock
124}
125
/// XML declaration prepended to every XML part in the package.
///
/// The literal declares XML version 1.0, UTF-8 encoding and
/// `standalone="yes"`; it carries no trailing newline.
const XML_DECLARATION: &str = r#"<?xml version="1.0" encoding="UTF-8" standalone="yes"?>"#;
128
/// In-memory representation of an OOXML spreadsheet package (`.xlsx` and
/// the other formats enumerated by [`WorkbookFormat`]).
///
/// Several fields are vectors documented as "parallel to `worksheets`":
/// element `i` of such a vector describes the sheet stored at
/// `worksheets[i]`.
pub struct Workbook {
    /// Package format; selects the content type written for
    /// `xl/workbook.xml` on save (see [`WorkbookFormat`]).
    format: WorkbookFormat,
    /// Content type table of the package (presumably the parsed
    /// `[Content_Types].xml` part; confirm against the open/save code).
    content_types: ContentTypes,
    /// Package-level relationships (presumably `_rels/.rels`; confirm
    /// against the open/save code).
    package_rels: Relationships,
    /// Parsed workbook part. Its sheet list supplies the relationship id
    /// (`r_id`) used by `sheet_part_path` to locate each worksheet part.
    workbook_xml: WorkbookXml,
    /// Relationships of the workbook part. `sheet_part_path` resolves
    /// worksheet targets from here relative to `xl/workbook.xml`.
    workbook_rels: Relationships,
    /// Per-sheet worksheet XML, stored as `(name, OnceLock<WorksheetXml>)`.
    /// When a sheet is eagerly parsed, the `OnceLock` is initialized at open
    /// time. When a sheet is deferred, the lock is empty and
    /// `raw_sheet_xml[i]` holds the raw bytes; the first call to
    /// [`Workbook::worksheet_ref`] or [`Workbook::worksheet_mut`] hydrates
    /// the lock on demand. A lock may also be initialized while
    /// `raw_sheet_xml[i]` still holds bytes; `ensure_hydrated` treats that
    /// state as a placeholder and replaces it with the parsed raw bytes.
    worksheets: Vec<(String, OnceLock<WorksheetXml>)>,
    /// Workbook stylesheet.
    stylesheet: StyleSheet,
    /// Runtime shared string table; handed to the streaming sheet readers
    /// and exposed read-only through [`Workbook::sst_ref`].
    sst_runtime: SharedStringTable,
    /// Per-sheet comments, parallel to the `worksheets` vector.
    sheet_comments: Vec<Option<Comments>>,
    /// Chart parts: (zip path like "xl/charts/chart1.xml", ChartSpace data).
    charts: Vec<(String, ChartSpace)>,
    /// Chart parts preserved as raw XML when typed parsing is not supported.
    raw_charts: Vec<(String, Vec<u8>)>,
    /// Drawing parts: (zip path like "xl/drawings/drawing1.xml", WsDr data).
    drawings: Vec<(String, WsDr)>,
    /// Image parts: (zip path like "xl/media/image1.png", raw bytes).
    images: Vec<(String, Vec<u8>)>,
    /// Maps sheet index -> drawing index in `drawings`.
    // NOTE(review): `dead_code` is allowed here without a stated reason;
    // confirm whether the field is still read anywhere.
    #[allow(dead_code)]
    worksheet_drawings: HashMap<usize, usize>,
    /// Per-sheet worksheet relationship files, keyed by sheet index.
    worksheet_rels: HashMap<usize, Relationships>,
    /// Per-drawing relationship files: drawing_index -> Relationships.
    drawing_rels: HashMap<usize, Relationships>,
    /// Core document properties (docProps/core.xml).
    core_properties: Option<sheetkit_xml::doc_props::CoreProperties>,
    /// Extended/application properties (docProps/app.xml).
    app_properties: Option<sheetkit_xml::doc_props::ExtendedProperties>,
    /// Custom properties (docProps/custom.xml).
    custom_properties: Option<sheetkit_xml::doc_props::CustomProperties>,
    /// Pivot table parts: (zip path, PivotTableDefinition data).
    pivot_tables: Vec<(String, sheetkit_xml::pivot_table::PivotTableDefinition)>,
    /// Pivot cache definition parts: (zip path, PivotCacheDefinition data).
    pivot_cache_defs: Vec<(String, sheetkit_xml::pivot_cache::PivotCacheDefinition)>,
    /// Pivot cache records parts: (zip path, PivotCacheRecords data).
    pivot_cache_records: Vec<(String, sheetkit_xml::pivot_cache::PivotCacheRecords)>,
    /// Raw theme XML bytes from xl/theme/theme1.xml (preserved for round-trip).
    theme_xml: Option<Vec<u8>>,
    /// Parsed theme colors from the theme XML.
    theme_colors: sheetkit_xml::theme::ThemeColors,
    /// Per-sheet sparkline configurations, parallel to the `worksheets` vector.
    sheet_sparklines: Vec<Vec<crate::sparkline::SparklineConfig>>,
    /// Per-sheet VML drawing bytes (for legacy comment rendering), parallel to `worksheets`.
    /// `None` means no VML part exists for that sheet.
    sheet_vml: Vec<Option<Vec<u8>>>,
    /// ZIP entries not recognized by the parser, preserved for round-trip fidelity.
    /// Each entry is (zip_path, raw_bytes).
    unknown_parts: Vec<(String, Vec<u8>)>,
    /// Typed index of auxiliary parts deferred during Lazy/Stream open.
    /// Stores raw bytes grouped by category (comments, charts, doc props, etc.)
    /// and supports on-demand hydration with dirty tracking.
    deferred_parts: aux::DeferredAuxParts,
    /// Raw VBA project binary blob (`xl/vbaProject.bin`), preserved for round-trip
    /// and used for VBA module extraction. `None` for non-macro workbooks.
    vba_blob: Option<Vec<u8>>,
    /// Table parts: (zip path like "xl/tables/table1.xml", TableXml data, sheet_index).
    tables: Vec<(String, sheetkit_xml::table::TableXml, usize)>,
    /// Raw XML bytes for sheets that were not parsed during open.
    /// Parallel to `worksheets`. `Some(bytes)` means the sheet XML has not
    /// been deserialized: either filtered out by the `sheets` option, or
    /// deferred in Lazy/Stream mode. The bytes are written directly on save
    /// if the corresponding `OnceLock` in `worksheets` was never initialized.
    /// `ensure_hydrated` resets the entry to `None` once it has parsed the
    /// bytes; the read-only hydration in `worksheet_ref_by_index` leaves the
    /// bytes in place.
    raw_sheet_xml: Vec<Option<Vec<u8>>>,
    /// Per-sheet dirty flag, parallel to `worksheets`. A sheet is marked
    /// dirty when it is mutated (via `worksheet_mut`, `set_cell_value`, etc.).
    /// Clean sheets with available raw bytes are written via passthrough on
    /// save, avoiding serialization overhead.
    sheet_dirty: Vec<bool>,
    /// Slicer definition parts: (zip path, SlicerDefinitions data).
    slicer_defs: Vec<(String, sheetkit_xml::slicer::SlicerDefinitions)>,
    /// Slicer cache definition parts: (zip path, SlicerCacheDefinition data).
    slicer_caches: Vec<(String, sheetkit_xml::slicer::SlicerCacheDefinition)>,
    /// Per-sheet threaded comments (Excel 2019+), parallel to the `worksheets` vector.
    sheet_threaded_comments: Vec<Option<sheetkit_xml::threaded_comment::ThreadedComments>>,
    /// Person list shared across all sheets (for threaded comment authors).
    person_list: sheetkit_xml::threaded_comment::PersonList,
    /// Per-sheet form control configurations, parallel to `worksheets`.
    sheet_form_controls: Vec<Vec<crate::control::FormControlConfig>>,
    /// O(1) sheet name -> index lookup cache. Must be kept in sync with
    /// `worksheets` via [`Workbook::rebuild_sheet_index`].
    sheet_name_index: HashMap<String, usize>,
    /// Streamed sheet data keyed by sheet index. During save, these sheets
    /// are written by streaming from their temp files instead of serializing
    /// the (empty placeholder) WorksheetXml. `invalidate_streamed` removes
    /// an entry so the regular serialization path is used instead.
    streamed_sheets: HashMap<usize, crate::stream::StreamedSheetData>,
    /// Backing storage for the xlsx package, retained for lazy part access.
    // NOTE(review): `dead_code` is allowed here without a stated reason;
    // presumably the value is held only to keep the source alive. Confirm.
    #[allow(dead_code)]
    package_source: Option<PackageSource>,
    /// Read mode used when this workbook was opened.
    read_mode: ReadMode,
    /// Optional row limit from `OpenOptions::sheet_rows`, applied during
    /// on-demand hydration of deferred sheets (the parsed row list is
    /// truncated to at most this many entries) and passed to the streaming
    /// sheet readers.
    sheet_rows_limit: Option<u32>,
}
231
232impl Workbook {
233    /// Return the detected or assigned workbook format.
234    pub fn format(&self) -> WorkbookFormat {
235        self.format
236    }
237
238    /// Set the workbook format. This determines the content type written for
239    /// `xl/workbook.xml` on save.
240    pub fn set_format(&mut self, format: WorkbookFormat) {
241        self.format = format;
242    }
243
244    /// Get the 0-based index of a sheet by name. O(1) via HashMap.
245    pub(crate) fn sheet_index(&self, sheet: &str) -> Result<usize> {
246        self.sheet_name_index
247            .get(sheet)
248            .copied()
249            .ok_or_else(|| Error::SheetNotFound {
250                name: sheet.to_string(),
251            })
252    }
253
254    /// Invalidate streamed data for a sheet by index. Must be called before
255    /// any mutation to a sheet that may have been created via StreamWriter,
256    /// so that the normal WorksheetXml serialization path is used on save.
257    pub(crate) fn invalidate_streamed(&mut self, idx: usize) {
258        self.streamed_sheets.remove(&idx);
259    }
260
261    /// Mark a sheet as dirty (modified). Dirty sheets are always serialized
262    /// on save, even if raw bytes exist. Clean sheets can use raw-byte
263    /// passthrough for zero-cost round-trip.
264    pub(crate) fn mark_sheet_dirty(&mut self, idx: usize) {
265        if idx < self.sheet_dirty.len() {
266            self.sheet_dirty[idx] = true;
267        }
268    }
269
270    /// Check whether a sheet has been marked dirty since opening.
271    #[cfg(test)]
272    pub(crate) fn is_sheet_dirty(&self, idx: usize) -> bool {
273        self.sheet_dirty.get(idx).copied().unwrap_or(false)
274    }
275
276    /// Get a mutable reference to the worksheet XML for the named sheet.
277    ///
278    /// If the sheet has streamed data (from [`apply_stream_writer`]), the
279    /// streamed entry is removed so that subsequent edits are not silently
280    /// ignored on save. Deferred sheets are hydrated on demand.
281    pub(crate) fn worksheet_mut(&mut self, sheet: &str) -> Result<&mut WorksheetXml> {
282        let idx = self.sheet_index(sheet)?;
283        self.invalidate_streamed(idx);
284        self.ensure_hydrated(idx)?;
285        self.mark_sheet_dirty(idx);
286        Ok(self.worksheets[idx].1.get_mut().unwrap())
287    }
288
289    /// Get an immutable reference to the worksheet XML for the named sheet.
290    /// Deferred sheets are hydrated lazily via `OnceLock`.
291    pub(crate) fn worksheet_ref(&self, sheet: &str) -> Result<&WorksheetXml> {
292        let idx = self.sheet_index(sheet)?;
293        self.worksheet_ref_by_index(idx)
294    }
295
296    /// Get an immutable reference to the worksheet XML by index.
297    /// Deferred sheets are hydrated lazily via `OnceLock`.
298    pub(crate) fn worksheet_ref_by_index(&self, idx: usize) -> Result<&WorksheetXml> {
299        if let Some(ws) = self.worksheets[idx].1.get() {
300            return Ok(ws);
301        }
302        // Hydrate from raw_sheet_xml on first access.
303        if let Some(Some(bytes)) = self.raw_sheet_xml.get(idx) {
304            let mut ws = io::deserialize_worksheet_xml(bytes)?;
305            if let Some(max_rows) = self.sheet_rows_limit {
306                ws.sheet_data.rows.truncate(max_rows as usize);
307            }
308            Ok(self.worksheets[idx].1.get_or_init(|| ws))
309        } else {
310            Err(Error::Internal(format!(
311                "sheet at index {} has no materialized or deferred data",
312                idx
313            )))
314        }
315    }
316
317    /// Public immutable reference to a worksheet's XML by sheet name.
318    /// Deferred sheets are hydrated lazily on first access.
319    pub fn worksheet_xml_ref(&self, sheet: &str) -> Result<&WorksheetXml> {
320        self.worksheet_ref(sheet)
321    }
322
323    /// Public immutable reference to the shared string table.
324    pub fn sst_ref(&self) -> &SharedStringTable {
325        &self.sst_runtime
326    }
327
328    /// Rebuild the sheet name -> index lookup after any structural change
329    /// to the worksheets vector.
330    pub(crate) fn rebuild_sheet_index(&mut self) {
331        self.sheet_name_index.clear();
332        for (i, (name, _ws_lock)) in self.worksheets.iter().enumerate() {
333            self.sheet_name_index.insert(name.clone(), i);
334        }
335    }
336
337    /// Ensure the sheet at the given index is hydrated (parsed from raw XML).
338    /// This is used by `&mut self` methods that need a mutable `OnceLock`
339    /// reference via `get_mut()`, which requires the lock to be initialized.
340    fn ensure_hydrated(&mut self, idx: usize) -> Result<()> {
341        if self.worksheets[idx].1.get().is_some() {
342            // OnceLock is set. If raw bytes are still present, this is a
343            // placeholder (filtered-out sheet with WorksheetXml::default()).
344            // Replace the placeholder with properly parsed data.
345            if let Some(Some(bytes)) = self.raw_sheet_xml.get(idx) {
346                let mut ws = io::deserialize_worksheet_xml(bytes)?;
347                if let Some(max_rows) = self.sheet_rows_limit {
348                    ws.sheet_data.rows.truncate(max_rows as usize);
349                }
350                *self.worksheets[idx].1.get_mut().unwrap() = ws;
351                self.raw_sheet_xml[idx] = None;
352            }
353            return Ok(());
354        }
355        if let Some(Some(bytes)) = self.raw_sheet_xml.get(idx) {
356            let mut ws = io::deserialize_worksheet_xml(bytes)?;
357            if let Some(max_rows) = self.sheet_rows_limit {
358                ws.sheet_data.rows.truncate(max_rows as usize);
359            }
360            let _ = self.worksheets[idx].1.set(ws);
361            self.raw_sheet_xml[idx] = None;
362            Ok(())
363        } else {
364            Err(Error::Internal(format!(
365                "sheet at index {} has no materialized or deferred data",
366                idx
367            )))
368        }
369    }
370
371    /// Hydrate if needed and return a mutable reference to the worksheet
372    /// at the given index. Callers must hold `&mut self`.
373    pub(crate) fn worksheet_mut_by_index(&mut self, idx: usize) -> Result<&mut WorksheetXml> {
374        self.ensure_hydrated(idx)?;
375        self.mark_sheet_dirty(idx);
376        Ok(self.worksheets[idx].1.get_mut().unwrap())
377    }
378
379    /// Resolve the part path for a sheet index from workbook relationships.
380    /// Falls back to the default `xl/worksheets/sheet{N}.xml` naming.
381    pub(crate) fn sheet_part_path(&self, sheet_idx: usize) -> String {
382        if let Some(sheet_entry) = self.workbook_xml.sheets.sheets.get(sheet_idx) {
383            if let Some(rel) = self
384                .workbook_rels
385                .relationships
386                .iter()
387                .find(|r| r.id == sheet_entry.r_id && r.rel_type == rel_types::WORKSHEET)
388            {
389                return resolve_relationship_target("xl/workbook.xml", &rel.target);
390            }
391        }
392        format!("xl/worksheets/sheet{}.xml", sheet_idx + 1)
393    }
394
395    /// Create a forward-only streaming reader for the named sheet.
396    ///
397    /// The reader processes worksheet XML row-by-row without materializing the
398    /// full DOM, enabling bounded-memory processing of large worksheets. The
399    /// workbook's shared string table and optional `sheet_rows` limit are
400    /// passed through to the reader.
401    ///
402    /// The XML bytes come from `raw_sheet_xml` (deferred sheets in Lazy/Stream
403    /// mode) or from a freshly hydrated worksheet serialized back to bytes.
404    pub fn open_sheet_reader(
405        &self,
406        sheet: &str,
407    ) -> Result<
408        crate::stream_reader::SheetStreamReader<'_, std::io::BufReader<std::io::Cursor<Vec<u8>>>>,
409    > {
410        let idx = self.sheet_index(sheet)?;
411        let xml_bytes = self.sheet_xml_bytes(idx)?;
412        let cursor = std::io::Cursor::new(xml_bytes);
413        let buf_reader = std::io::BufReader::new(cursor);
414        Ok(crate::stream_reader::SheetStreamReader::new(
415            buf_reader,
416            &self.sst_runtime,
417            self.sheet_rows_limit,
418        ))
419    }
420
421    /// Create an owned forward-only streaming reader for the named sheet.
422    ///
423    /// Unlike [`open_sheet_reader`], the returned reader owns its shared
424    /// string table snapshot and XML bytes, so it has no lifetime tied to
425    /// the workbook. This is suitable for FFI contexts (e.g., napi classes)
426    /// where lifetime parameters are not supported.
427    pub fn open_sheet_reader_owned(
428        &self,
429        sheet: &str,
430    ) -> Result<crate::stream_reader::OwnedSheetStreamReader> {
431        let idx = self.sheet_index(sheet)?;
432        let xml_bytes = self.sheet_xml_bytes(idx)?;
433        let sst_snapshot = self.sst_runtime.clone_for_read();
434        Ok(crate::stream_reader::OwnedSheetStreamReader::new(
435            xml_bytes,
436            sst_snapshot,
437            self.sheet_rows_limit,
438        ))
439    }
440
441    /// Get the raw XML bytes for a sheet by index.
442    ///
443    /// When the OnceLock is uninitialised (Lazy/Stream deferred), raw bytes
444    /// from `raw_sheet_xml` are used so the DOM is never materialised. When
445    /// the OnceLock IS initialised (Eager parse or filtered-out sheet), the
446    /// parsed worksheet is serialised back so that `sheets(...)` filtering is
447    /// respected (filtered sheets have an empty worksheet placeholder).
448    ///
449    /// The returned bytes are cloned because the `SheetStreamReader` takes
450    /// ownership of its `BufRead` source.
451    fn sheet_xml_bytes(&self, idx: usize) -> Result<Vec<u8>> {
452        // If the OnceLock is already initialised (eager parse OR filtered-out
453        // placeholder), serialise whatever is stored there. This ensures
454        // filtered-out sheets yield an empty worksheet.
455        if let Some(ws) = self.worksheets[idx].1.get() {
456            let xml = quick_xml::se::to_string(ws)
457                .map_err(|e| Error::Internal(format!("failed to serialize worksheet: {e}")))?;
458            return Ok(xml.into_bytes());
459        }
460        // Lazy/Stream deferred: OnceLock not yet initialised, use raw bytes.
461        if let Some(Some(bytes)) = self.raw_sheet_xml.get(idx) {
462            return Ok(bytes.clone());
463        }
464        Err(Error::Internal(format!(
465            "sheet at index {} has no materialized or deferred data",
466            idx
467        )))
468    }
469}