// sheetkit_core/workbook/mod.rs
//! Workbook file I/O: reading and writing `.xlsx` files.
//!
//! An `.xlsx` file is a ZIP archive containing XML parts. This module provides
//! [`Workbook`], which holds the parsed XML structures in memory and can
//! serialize them back to a valid `.xlsx` file.
6
7use std::collections::{HashMap, HashSet};
8use std::io::{Read as _, Write as _};
9use std::path::Path;
10use std::sync::OnceLock;
11
12use serde::Serialize;
13use sheetkit_xml::chart::ChartSpace;
14use sheetkit_xml::comments::Comments;
15use sheetkit_xml::content_types::{
16 mime_types, ContentTypeDefault, ContentTypeOverride, ContentTypes,
17};
18
/// OOXML package flavor of a workbook.
///
/// The flavor mirrors the content type registered for the workbook part in
/// `[Content_Types].xml`, and selects which content type string is emitted
/// for `xl/workbook.xml` when the workbook is saved.
#[derive(Clone, Copy, Debug, Default, PartialEq, Eq)]
pub enum WorkbookFormat {
    /// Standard spreadsheet (`.xlsx`); this is the default flavor.
    #[default]
    Xlsx,
    /// Macro-enabled spreadsheet (`.xlsm`).
    Xlsm,
    /// Template (`.xltx`).
    Xltx,
    /// Macro-enabled template (`.xltm`).
    Xltm,
    /// Macro-enabled add-in (`.xlam`).
    Xlam,
}
36
37impl WorkbookFormat {
38 /// Infer the format from a workbook content type string found in
39 /// `[Content_Types].xml`.
40 pub fn from_content_type(ct: &str) -> Option<Self> {
41 match ct {
42 mime_types::WORKBOOK => Some(Self::Xlsx),
43 mime_types::WORKBOOK_MACRO => Some(Self::Xlsm),
44 mime_types::WORKBOOK_TEMPLATE => Some(Self::Xltx),
45 mime_types::WORKBOOK_TEMPLATE_MACRO => Some(Self::Xltm),
46 mime_types::WORKBOOK_ADDIN_MACRO => Some(Self::Xlam),
47 _ => None,
48 }
49 }
50
51 /// Infer the format from a file extension (case-insensitive, without the
52 /// leading dot). Returns `None` for unrecognized extensions.
53 pub fn from_extension(ext: &str) -> Option<Self> {
54 match ext.to_ascii_lowercase().as_str() {
55 "xlsx" => Some(Self::Xlsx),
56 "xlsm" => Some(Self::Xlsm),
57 "xltx" => Some(Self::Xltx),
58 "xltm" => Some(Self::Xltm),
59 "xlam" => Some(Self::Xlam),
60 _ => None,
61 }
62 }
63
64 /// Return the OOXML content type string for this format.
65 pub fn content_type(self) -> &'static str {
66 match self {
67 Self::Xlsx => mime_types::WORKBOOK,
68 Self::Xlsm => mime_types::WORKBOOK_MACRO,
69 Self::Xltx => mime_types::WORKBOOK_TEMPLATE,
70 Self::Xltm => mime_types::WORKBOOK_TEMPLATE_MACRO,
71 Self::Xlam => mime_types::WORKBOOK_ADDIN_MACRO,
72 }
73 }
74}
75
76use sheetkit_xml::drawing::{MarkerType, WsDr};
77use sheetkit_xml::relationships::{self, rel_types, Relationship, Relationships};
78use sheetkit_xml::shared_strings::Sst;
79use sheetkit_xml::styles::StyleSheet;
80use sheetkit_xml::workbook::{WorkbookProtection, WorkbookXml};
81use sheetkit_xml::worksheet::{Cell, CellFormula, CellTypeTag, DrawingRef, Row, WorksheetXml};
82use zip::write::SimpleFileOptions;
83use zip::CompressionMethod;
84
85use crate::cell::CellValue;
86use crate::cell_ref_shift::shift_cell_references_in_text;
87use crate::chart::ChartConfig;
88use crate::comment::CommentConfig;
89use crate::conditional::ConditionalFormatRule;
90use crate::error::{Error, Result};
91use crate::image::ImageConfig;
92use crate::pivot::{PivotTableConfig, PivotTableInfo};
93use crate::protection::WorkbookProtectionConfig;
94use crate::sst::SharedStringTable;
95use crate::threaded_comment::{PersonData, PersonInput, ThreadedCommentData, ThreadedCommentInput};
96use crate::utils::cell_ref::{cell_name_to_coordinates, column_name_to_number};
97use crate::utils::constants::MAX_CELL_CHARS;
98use crate::validation::DataValidationConfig;
99use crate::workbook_paths::{
100 default_relationships, relationship_part_path, relative_relationship_target,
101 resolve_relationship_target,
102};
103
104#[path = "aux_parts.rs"]
105pub(crate) mod aux;
106mod cell_ops;
107mod data;
108mod drawing;
109mod features;
110mod io;
111mod open_options;
112mod sheet_ops;
113mod source;
114
115pub use open_options::{AuxParts, OpenOptions, ReadMode};
116pub(crate) use source::PackageSource;
117
118/// Helper to initialize an `OnceLock<WorksheetXml>` with a value at
119/// construction time. Avoids repeating the `set`+`unwrap` pattern.
120pub(crate) fn initialized_lock(ws: WorksheetXml) -> OnceLock<WorksheetXml> {
121 let lock = OnceLock::new();
122 let _ = lock.set(ws);
123 lock
124}
125
/// Standard XML prolog (version 1.0, UTF-8, standalone) that is prepended to
/// every XML part written into the package.
const XML_DECLARATION: &str = r#"<?xml version="1.0" encoding="UTF-8" standalone="yes"?>"#;
128
129/// In-memory representation of an `.xlsx` workbook.
130pub struct Workbook {
131 format: WorkbookFormat,
132 content_types: ContentTypes,
133 package_rels: Relationships,
134 workbook_xml: WorkbookXml,
135 workbook_rels: Relationships,
136 /// Per-sheet worksheet XML, stored as `(name, OnceLock<WorksheetXml>)`.
137 /// When a sheet is eagerly parsed, the `OnceLock` is initialized at open
138 /// time. When a sheet is deferred (lazy mode or filtered out), the lock
139 /// is empty and `raw_sheet_xml[i]` holds the raw bytes; the first call
140 /// to [`worksheet_ref`] or [`worksheet_mut`] hydrates the lock on demand.
141 worksheets: Vec<(String, OnceLock<WorksheetXml>)>,
142 stylesheet: StyleSheet,
143 sst_runtime: SharedStringTable,
144 /// Per-sheet comments, parallel to the `worksheets` vector.
145 sheet_comments: Vec<Option<Comments>>,
146 /// Chart parts: (zip path like "xl/charts/chart1.xml", ChartSpace data).
147 charts: Vec<(String, ChartSpace)>,
148 /// Chart parts preserved as raw XML when typed parsing is not supported.
149 raw_charts: Vec<(String, Vec<u8>)>,
150 /// Drawing parts: (zip path like "xl/drawings/drawing1.xml", WsDr data).
151 drawings: Vec<(String, WsDr)>,
152 /// Image parts: (zip path like "xl/media/image1.png", raw bytes).
153 images: Vec<(String, Vec<u8>)>,
154 /// Maps sheet index -> drawing index in `drawings`.
155 #[allow(dead_code)]
156 worksheet_drawings: HashMap<usize, usize>,
157 /// Per-sheet worksheet relationship files.
158 worksheet_rels: HashMap<usize, Relationships>,
159 /// Per-drawing relationship files: drawing_index -> Relationships.
160 drawing_rels: HashMap<usize, Relationships>,
161 /// Core document properties (docProps/core.xml).
162 core_properties: Option<sheetkit_xml::doc_props::CoreProperties>,
163 /// Extended/application properties (docProps/app.xml).
164 app_properties: Option<sheetkit_xml::doc_props::ExtendedProperties>,
165 /// Custom properties (docProps/custom.xml).
166 custom_properties: Option<sheetkit_xml::doc_props::CustomProperties>,
167 /// Pivot table parts: (zip path, PivotTableDefinition data).
168 pivot_tables: Vec<(String, sheetkit_xml::pivot_table::PivotTableDefinition)>,
169 /// Pivot cache definition parts: (zip path, PivotCacheDefinition data).
170 pivot_cache_defs: Vec<(String, sheetkit_xml::pivot_cache::PivotCacheDefinition)>,
171 /// Pivot cache records parts: (zip path, PivotCacheRecords data).
172 pivot_cache_records: Vec<(String, sheetkit_xml::pivot_cache::PivotCacheRecords)>,
173 /// Raw theme XML bytes from xl/theme/theme1.xml (preserved for round-trip).
174 theme_xml: Option<Vec<u8>>,
175 /// Parsed theme colors from the theme XML.
176 theme_colors: sheetkit_xml::theme::ThemeColors,
177 /// Per-sheet sparkline configurations, parallel to the `worksheets` vector.
178 sheet_sparklines: Vec<Vec<crate::sparkline::SparklineConfig>>,
179 /// Per-sheet VML drawing bytes (for legacy comment rendering), parallel to `worksheets`.
180 /// `None` means no VML part exists for that sheet.
181 sheet_vml: Vec<Option<Vec<u8>>>,
182 /// ZIP entries not recognized by the parser, preserved for round-trip fidelity.
183 /// Each entry is (zip_path, raw_bytes).
184 unknown_parts: Vec<(String, Vec<u8>)>,
185 /// Typed index of auxiliary parts deferred during Lazy/Stream open.
186 /// Stores raw bytes grouped by category (comments, charts, doc props, etc.)
187 /// and supports on-demand hydration with dirty tracking.
188 deferred_parts: aux::DeferredAuxParts,
189 /// Raw VBA project binary blob (`xl/vbaProject.bin`), preserved for round-trip
190 /// and used for VBA module extraction. `None` for non-macro workbooks.
191 vba_blob: Option<Vec<u8>>,
192 /// Table parts: (zip path like "xl/tables/table1.xml", TableXml data, sheet_index).
193 tables: Vec<(String, sheetkit_xml::table::TableXml, usize)>,
194 /// Raw XML bytes for sheets that were not parsed during open.
195 /// Parallel to `worksheets`. `Some(bytes)` means the sheet XML has not
196 /// been deserialized: either filtered out by the `sheets` option, or
197 /// deferred in Lazy/Stream mode. The bytes are written directly on save
198 /// if the corresponding `OnceLock` in `worksheets` was never initialized.
199 raw_sheet_xml: Vec<Option<Vec<u8>>>,
200 /// Per-sheet dirty flag, parallel to `worksheets`. A sheet is marked
201 /// dirty when it is mutated (via `worksheet_mut`, `set_cell_value`, etc.).
202 /// Clean sheets with available raw bytes are written via passthrough on
203 /// save, avoiding serialization overhead.
204 sheet_dirty: Vec<bool>,
205 /// Slicer definition parts: (zip path, SlicerDefinitions data).
206 slicer_defs: Vec<(String, sheetkit_xml::slicer::SlicerDefinitions)>,
207 /// Slicer cache definition parts: (zip path, raw XML string).
208 slicer_caches: Vec<(String, sheetkit_xml::slicer::SlicerCacheDefinition)>,
209 /// Per-sheet threaded comments (Excel 2019+), parallel to the `worksheets` vector.
210 sheet_threaded_comments: Vec<Option<sheetkit_xml::threaded_comment::ThreadedComments>>,
211 /// Person list shared across all sheets (for threaded comment authors).
212 person_list: sheetkit_xml::threaded_comment::PersonList,
213 /// Per-sheet form control configurations, parallel to `worksheets`.
214 sheet_form_controls: Vec<Vec<crate::control::FormControlConfig>>,
215 /// O(1) sheet name -> index lookup cache. Must be kept in sync with
216 /// `worksheets` via [`rebuild_sheet_index`].
217 sheet_name_index: HashMap<String, usize>,
218 /// Streamed sheet data keyed by sheet index. During save, these sheets
219 /// are written by streaming from their temp files instead of serializing
220 /// the (empty placeholder) WorksheetXml.
221 streamed_sheets: HashMap<usize, crate::stream::StreamedSheetData>,
222 /// Backing storage for the xlsx package, retained for lazy part access.
223 #[allow(dead_code)]
224 package_source: Option<PackageSource>,
225 /// Read mode used when this workbook was opened.
226 read_mode: ReadMode,
227 /// Optional row limit from `OpenOptions::sheet_rows`, applied during
228 /// on-demand hydration of deferred sheets.
229 sheet_rows_limit: Option<u32>,
230}
231
232impl Workbook {
233 /// Return the detected or assigned workbook format.
234 pub fn format(&self) -> WorkbookFormat {
235 self.format
236 }
237
238 /// Set the workbook format. This determines the content type written for
239 /// `xl/workbook.xml` on save.
240 pub fn set_format(&mut self, format: WorkbookFormat) {
241 self.format = format;
242 }
243
244 /// Get the 0-based index of a sheet by name. O(1) via HashMap.
245 pub(crate) fn sheet_index(&self, sheet: &str) -> Result<usize> {
246 self.sheet_name_index
247 .get(sheet)
248 .copied()
249 .ok_or_else(|| Error::SheetNotFound {
250 name: sheet.to_string(),
251 })
252 }
253
254 /// Invalidate streamed data for a sheet by index. Must be called before
255 /// any mutation to a sheet that may have been created via StreamWriter,
256 /// so that the normal WorksheetXml serialization path is used on save.
257 pub(crate) fn invalidate_streamed(&mut self, idx: usize) {
258 self.streamed_sheets.remove(&idx);
259 }
260
261 /// Mark a sheet as dirty (modified). Dirty sheets are always serialized
262 /// on save, even if raw bytes exist. Clean sheets can use raw-byte
263 /// passthrough for zero-cost round-trip.
264 pub(crate) fn mark_sheet_dirty(&mut self, idx: usize) {
265 if idx < self.sheet_dirty.len() {
266 self.sheet_dirty[idx] = true;
267 }
268 }
269
270 /// Check whether a sheet has been marked dirty since opening.
271 #[cfg(test)]
272 pub(crate) fn is_sheet_dirty(&self, idx: usize) -> bool {
273 self.sheet_dirty.get(idx).copied().unwrap_or(false)
274 }
275
276 /// Get a mutable reference to the worksheet XML for the named sheet.
277 ///
278 /// If the sheet has streamed data (from [`apply_stream_writer`]), the
279 /// streamed entry is removed so that subsequent edits are not silently
280 /// ignored on save. Deferred sheets are hydrated on demand.
281 pub(crate) fn worksheet_mut(&mut self, sheet: &str) -> Result<&mut WorksheetXml> {
282 let idx = self.sheet_index(sheet)?;
283 self.invalidate_streamed(idx);
284 self.ensure_hydrated(idx)?;
285 self.mark_sheet_dirty(idx);
286 Ok(self.worksheets[idx].1.get_mut().unwrap())
287 }
288
289 /// Get an immutable reference to the worksheet XML for the named sheet.
290 /// Deferred sheets are hydrated lazily via `OnceLock`.
291 pub(crate) fn worksheet_ref(&self, sheet: &str) -> Result<&WorksheetXml> {
292 let idx = self.sheet_index(sheet)?;
293 self.worksheet_ref_by_index(idx)
294 }
295
296 /// Get an immutable reference to the worksheet XML by index.
297 /// Deferred sheets are hydrated lazily via `OnceLock`.
298 pub(crate) fn worksheet_ref_by_index(&self, idx: usize) -> Result<&WorksheetXml> {
299 if let Some(ws) = self.worksheets[idx].1.get() {
300 return Ok(ws);
301 }
302 // Hydrate from raw_sheet_xml on first access.
303 if let Some(Some(bytes)) = self.raw_sheet_xml.get(idx) {
304 let mut ws = io::deserialize_worksheet_xml(bytes)?;
305 if let Some(max_rows) = self.sheet_rows_limit {
306 ws.sheet_data.rows.truncate(max_rows as usize);
307 }
308 Ok(self.worksheets[idx].1.get_or_init(|| ws))
309 } else {
310 Err(Error::Internal(format!(
311 "sheet at index {} has no materialized or deferred data",
312 idx
313 )))
314 }
315 }
316
317 /// Public immutable reference to a worksheet's XML by sheet name.
318 /// Deferred sheets are hydrated lazily on first access.
319 pub fn worksheet_xml_ref(&self, sheet: &str) -> Result<&WorksheetXml> {
320 self.worksheet_ref(sheet)
321 }
322
323 /// Public immutable reference to the shared string table.
324 pub fn sst_ref(&self) -> &SharedStringTable {
325 &self.sst_runtime
326 }
327
328 /// Rebuild the sheet name -> index lookup after any structural change
329 /// to the worksheets vector.
330 pub(crate) fn rebuild_sheet_index(&mut self) {
331 self.sheet_name_index.clear();
332 for (i, (name, _ws_lock)) in self.worksheets.iter().enumerate() {
333 self.sheet_name_index.insert(name.clone(), i);
334 }
335 }
336
337 /// Ensure the sheet at the given index is hydrated (parsed from raw XML).
338 /// This is used by `&mut self` methods that need a mutable `OnceLock`
339 /// reference via `get_mut()`, which requires the lock to be initialized.
340 fn ensure_hydrated(&mut self, idx: usize) -> Result<()> {
341 if self.worksheets[idx].1.get().is_some() {
342 // OnceLock is set. If raw bytes are still present, this is a
343 // placeholder (filtered-out sheet with WorksheetXml::default()).
344 // Replace the placeholder with properly parsed data.
345 if let Some(Some(bytes)) = self.raw_sheet_xml.get(idx) {
346 let mut ws = io::deserialize_worksheet_xml(bytes)?;
347 if let Some(max_rows) = self.sheet_rows_limit {
348 ws.sheet_data.rows.truncate(max_rows as usize);
349 }
350 *self.worksheets[idx].1.get_mut().unwrap() = ws;
351 self.raw_sheet_xml[idx] = None;
352 }
353 return Ok(());
354 }
355 if let Some(Some(bytes)) = self.raw_sheet_xml.get(idx) {
356 let mut ws = io::deserialize_worksheet_xml(bytes)?;
357 if let Some(max_rows) = self.sheet_rows_limit {
358 ws.sheet_data.rows.truncate(max_rows as usize);
359 }
360 let _ = self.worksheets[idx].1.set(ws);
361 self.raw_sheet_xml[idx] = None;
362 Ok(())
363 } else {
364 Err(Error::Internal(format!(
365 "sheet at index {} has no materialized or deferred data",
366 idx
367 )))
368 }
369 }
370
371 /// Hydrate if needed and return a mutable reference to the worksheet
372 /// at the given index. Callers must hold `&mut self`.
373 pub(crate) fn worksheet_mut_by_index(&mut self, idx: usize) -> Result<&mut WorksheetXml> {
374 self.ensure_hydrated(idx)?;
375 self.mark_sheet_dirty(idx);
376 Ok(self.worksheets[idx].1.get_mut().unwrap())
377 }
378
379 /// Resolve the part path for a sheet index from workbook relationships.
380 /// Falls back to the default `xl/worksheets/sheet{N}.xml` naming.
381 pub(crate) fn sheet_part_path(&self, sheet_idx: usize) -> String {
382 if let Some(sheet_entry) = self.workbook_xml.sheets.sheets.get(sheet_idx) {
383 if let Some(rel) = self
384 .workbook_rels
385 .relationships
386 .iter()
387 .find(|r| r.id == sheet_entry.r_id && r.rel_type == rel_types::WORKSHEET)
388 {
389 return resolve_relationship_target("xl/workbook.xml", &rel.target);
390 }
391 }
392 format!("xl/worksheets/sheet{}.xml", sheet_idx + 1)
393 }
394
395 /// Create a forward-only streaming reader for the named sheet.
396 ///
397 /// The reader processes worksheet XML row-by-row without materializing the
398 /// full DOM, enabling bounded-memory processing of large worksheets. The
399 /// workbook's shared string table and optional `sheet_rows` limit are
400 /// passed through to the reader.
401 ///
402 /// The XML bytes come from `raw_sheet_xml` (deferred sheets in Lazy/Stream
403 /// mode) or from a freshly hydrated worksheet serialized back to bytes.
404 pub fn open_sheet_reader(
405 &self,
406 sheet: &str,
407 ) -> Result<
408 crate::stream_reader::SheetStreamReader<'_, std::io::BufReader<std::io::Cursor<Vec<u8>>>>,
409 > {
410 let idx = self.sheet_index(sheet)?;
411 let xml_bytes = self.sheet_xml_bytes(idx)?;
412 let cursor = std::io::Cursor::new(xml_bytes);
413 let buf_reader = std::io::BufReader::new(cursor);
414 Ok(crate::stream_reader::SheetStreamReader::new(
415 buf_reader,
416 &self.sst_runtime,
417 self.sheet_rows_limit,
418 ))
419 }
420
421 /// Create an owned forward-only streaming reader for the named sheet.
422 ///
423 /// Unlike [`open_sheet_reader`], the returned reader owns its shared
424 /// string table snapshot and XML bytes, so it has no lifetime tied to
425 /// the workbook. This is suitable for FFI contexts (e.g., napi classes)
426 /// where lifetime parameters are not supported.
427 pub fn open_sheet_reader_owned(
428 &self,
429 sheet: &str,
430 ) -> Result<crate::stream_reader::OwnedSheetStreamReader> {
431 let idx = self.sheet_index(sheet)?;
432 let xml_bytes = self.sheet_xml_bytes(idx)?;
433 let sst_snapshot = self.sst_runtime.clone_for_read();
434 Ok(crate::stream_reader::OwnedSheetStreamReader::new(
435 xml_bytes,
436 sst_snapshot,
437 self.sheet_rows_limit,
438 ))
439 }
440
441 /// Get the raw XML bytes for a sheet by index.
442 ///
443 /// When the OnceLock is uninitialised (Lazy/Stream deferred), raw bytes
444 /// from `raw_sheet_xml` are used so the DOM is never materialised. When
445 /// the OnceLock IS initialised (Eager parse or filtered-out sheet), the
446 /// parsed worksheet is serialised back so that `sheets(...)` filtering is
447 /// respected (filtered sheets have an empty worksheet placeholder).
448 ///
449 /// The returned bytes are cloned because the `SheetStreamReader` takes
450 /// ownership of its `BufRead` source.
451 fn sheet_xml_bytes(&self, idx: usize) -> Result<Vec<u8>> {
452 // If the OnceLock is already initialised (eager parse OR filtered-out
453 // placeholder), serialise whatever is stored there. This ensures
454 // filtered-out sheets yield an empty worksheet.
455 if let Some(ws) = self.worksheets[idx].1.get() {
456 let xml = quick_xml::se::to_string(ws)
457 .map_err(|e| Error::Internal(format!("failed to serialize worksheet: {e}")))?;
458 return Ok(xml.into_bytes());
459 }
460 // Lazy/Stream deferred: OnceLock not yet initialised, use raw bytes.
461 if let Some(Some(bytes)) = self.raw_sheet_xml.get(idx) {
462 return Ok(bytes.clone());
463 }
464 Err(Error::Internal(format!(
465 "sheet at index {} has no materialized or deferred data",
466 idx
467 )))
468 }
469}