sheetkit_core/
sst.rs

1//! Runtime shared string table.
2//!
3//! The [`SharedStringTable`] provides an efficient in-memory index for looking
4//! up and inserting shared strings. It bridges the gap between the XML-level
5//! [`sheetkit_xml::shared_strings::Sst`] and the high-level cell API.
6
7use std::collections::HashMap;
8use std::sync::Arc;
9
10use sheetkit_xml::shared_strings::{Si, Sst, T};
11
12use crate::rich_text::{xml_to_run, RichTextRun};
13
14/// Runtime shared string table for efficient string lookup and insertion.
15///
16/// Maintains both an ordered list of strings (for index-based lookup) and a
17/// reverse hash map (for deduplication when inserting). Uses `Arc<str>` so that
18/// both collections share the same string allocation. Original [`Si`] items
19/// loaded from file are preserved so that `to_sst()` can reuse them without
20/// cloning the string data a second time.
21#[derive(Debug)]
22pub struct SharedStringTable {
23    strings: Vec<Arc<str>>,
24    index_map: HashMap<Arc<str>, usize>,
25    /// Original or constructed Si items, parallel to `strings`.
26    /// `None` for plain-text items added via `add()` / `add_owned()`.
27    si_items: Vec<Option<Si>>,
28}
29
30impl SharedStringTable {
31    /// Create a new, empty shared string table.
32    pub fn new() -> Self {
33        Self {
34            strings: Vec::new(),
35            index_map: HashMap::new(),
36            si_items: Vec::new(),
37        }
38    }
39
40    /// Build from an XML [`Sst`], taking ownership to avoid cloning items.
41    ///
42    /// Plain-text items consume the `t.value` String directly (zero-copy into
43    /// `Arc<str>`). Rich-text items concatenate all run texts. Pre-sizes
44    /// internal containers.
45    pub fn from_sst(sst: Sst) -> Self {
46        let cap = sst.items.len();
47        let mut strings = Vec::with_capacity(cap);
48        let mut index_map = HashMap::with_capacity(cap);
49        let mut si_items: Vec<Option<Si>> = Vec::with_capacity(cap);
50
51        for mut si in sst.items {
52            let is_rich = si.t.is_none() && !si.r.is_empty();
53            let has_space_attr = si.t.as_ref().is_some_and(|t| t.xml_space.is_some());
54            let preserve_si = is_rich || has_space_attr;
55
56            let text: Arc<str> = if preserve_si {
57                // Rich text or space-preserved: extract text without consuming
58                // the Si, since we need to store it.
59                si_to_string(&si).into()
60            } else if let Some(ref mut t) = si.t {
61                // Plain text: take ownership of the string to avoid cloning.
62                std::mem::take(&mut t.value).into()
63            } else {
64                // Empty item.
65                Arc::from("")
66            };
67
68            let idx = strings.len();
69            index_map.entry(Arc::clone(&text)).or_insert(idx);
70            if preserve_si {
71                si_items.push(Some(si));
72            } else {
73                si_items.push(None);
74            }
75            strings.push(text);
76        }
77
78        Self {
79            strings,
80            index_map,
81            si_items,
82        }
83    }
84
85    /// Convert back to an XML [`Sst`] struct for serialization.
86    ///
87    /// Reuses stored [`Si`] items for entries loaded from file. Builds new
88    /// `Si` items only for strings added at runtime.
89    pub fn to_sst(&self) -> Sst {
90        let items: Vec<Si> = self
91            .strings
92            .iter()
93            .enumerate()
94            .map(|(idx, s)| {
95                if let Some(ref si) = self.si_items[idx] {
96                    si.clone()
97                } else {
98                    Si {
99                        t: Some(T {
100                            xml_space: if needs_space_preserve(s) {
101                                Some("preserve".to_string())
102                            } else {
103                                None
104                            },
105                            value: s.to_string(),
106                        }),
107                        r: vec![],
108                    }
109                }
110            })
111            .collect();
112
113        let len = items.len() as u32;
114        Sst {
115            xmlns: sheetkit_xml::namespaces::SPREADSHEET_ML.to_string(),
116            count: Some(len),
117            unique_count: Some(len),
118            items,
119        }
120    }
121
122    /// Get a string by its index.
123    pub fn get(&self, index: usize) -> Option<&str> {
124        self.strings.get(index).map(|s| &**s)
125    }
126
127    /// Add a string by reference, returning its index.
128    ///
129    /// If the string already exists, the existing index is returned (dedup).
130    pub fn add(&mut self, s: &str) -> usize {
131        if let Some(&idx) = self.index_map.get(s) {
132            return idx;
133        }
134        let idx = self.strings.len();
135        let rc: Arc<str> = s.into();
136        self.strings.push(Arc::clone(&rc));
137        self.index_map.insert(rc, idx);
138        self.si_items.push(None);
139        idx
140    }
141
142    /// Add a string by value, returning its index.
143    ///
144    /// Avoids one allocation compared to `add()` when the caller already
145    /// owns a `String`.
146    pub fn add_owned(&mut self, s: String) -> usize {
147        if let Some(&idx) = self.index_map.get(s.as_str()) {
148            return idx;
149        }
150        let idx = self.strings.len();
151        let rc: Arc<str> = s.into();
152        self.index_map.insert(Arc::clone(&rc), idx);
153        self.strings.push(rc);
154        self.si_items.push(None);
155        idx
156    }
157
158    /// Add rich text runs, returning the SST index.
159    ///
160    /// The plain-text concatenation of the runs is used for deduplication.
161    pub fn add_rich_text(&mut self, runs: &[RichTextRun]) -> usize {
162        let plain: String = runs.iter().map(|r| r.text.as_str()).collect();
163        if let Some(&idx) = self.index_map.get(plain.as_str()) {
164            return idx;
165        }
166        let idx = self.strings.len();
167        let rc: Arc<str> = plain.into();
168        self.index_map.insert(Arc::clone(&rc), idx);
169        self.strings.push(rc);
170        let si = crate::rich_text::runs_to_si(runs);
171        self.si_items.push(Some(si));
172        idx
173    }
174
175    /// Get rich text runs for an SST entry, if it has formatting.
176    ///
177    /// Returns `None` for plain-text entries.
178    pub fn get_rich_text(&self, index: usize) -> Option<Vec<RichTextRun>> {
179        self.si_items
180            .get(index)
181            .and_then(|opt| opt.as_ref())
182            .filter(|si| !si.r.is_empty())
183            .map(|si| si.r.iter().map(xml_to_run).collect())
184    }
185
186    /// Number of unique strings.
187    pub fn len(&self) -> usize {
188        self.strings.len()
189    }
190
191    /// Returns `true` if the table contains no strings.
192    pub fn is_empty(&self) -> bool {
193        self.strings.is_empty()
194    }
195
196    /// Create a read-only clone suitable for use by an owned stream reader.
197    ///
198    /// Clones the string list (`Arc<str>` refcount bumps only) and the Si
199    /// items, but omits the reverse `index_map` since the clone is read-only.
200    /// This is cheaper than a full clone and sufficient for SST index lookups.
201    pub fn clone_for_read(&self) -> Self {
202        Self {
203            strings: self.strings.clone(),
204            index_map: HashMap::new(),
205            si_items: self.si_items.clone(),
206        }
207    }
208}
209
210impl Default for SharedStringTable {
211    fn default() -> Self {
212        Self::new()
213    }
214}
215
/// Check whether a string needs `xml:space="preserve"`.
///
/// Returns `true` when the text starts or ends with a space, contains two
/// consecutive spaces, or contains a line feed, tab, or carriage return
/// anywhere. Returns `false` otherwise, including for the empty string.
fn needs_space_preserve(s: &str) -> bool {
    s.starts_with(' ')
        || s.ends_with(' ')
        || s.contains("  ")
        // Carriage return is XML whitespace too, alongside tab and line feed.
        || s.contains(|c: char| matches!(c, '\n' | '\t' | '\r'))
}
224
225/// Extract the plain-text content of a shared string item.
226///
227/// For plain items, returns `si.t.value`. For rich-text items, concatenates
228/// all run texts.
229fn si_to_string(si: &Si) -> String {
230    if let Some(ref t) = si.t {
231        t.value.clone()
232    } else {
233        // Rich text: concatenate all runs.
234        si.r.iter().map(|r| r.t.value.as_str()).collect()
235    }
236}
237
#[cfg(test)]
mod tests {
    use super::*;
    use sheetkit_xml::shared_strings::{Si, Sst, R, T};

    /// Plain item: `<t>` without an `xml:space` attribute, no runs.
    fn plain_si(text: &str) -> Si {
        Si {
            t: Some(T {
                xml_space: None,
                value: text.to_string(),
            }),
            r: vec![],
        }
    }

    /// Plain item whose `<t>` carries `xml:space="preserve"`.
    fn preserved_si(text: &str) -> Si {
        Si {
            t: Some(T {
                xml_space: Some("preserve".to_string()),
                value: text.to_string(),
            }),
            r: vec![],
        }
    }

    /// Rich-text item: no `<t>`, one unformatted run per element of `parts`.
    fn rich_si(parts: &[&str]) -> Si {
        Si {
            t: None,
            r: parts
                .iter()
                .map(|part| R {
                    r_pr: None,
                    t: T {
                        xml_space: None,
                        value: (*part).to_string(),
                    },
                })
                .collect(),
        }
    }

    /// Wrap items in an `Sst` whose counts both equal the number of items.
    fn sst_of(items: Vec<Si>) -> Sst {
        let n = u32::try_from(items.len()).expect("fixture size fits in u32");
        Sst {
            xmlns: sheetkit_xml::namespaces::SPREADSHEET_ML.to_string(),
            count: Some(n),
            unique_count: Some(n),
            items,
        }
    }

    /// Run with only `text` and `bold` set; every other field is off/absent.
    fn run(text: &str, bold: bool) -> RichTextRun {
        RichTextRun {
            text: text.to_string(),
            font: None,
            size: None,
            bold,
            italic: false,
            color: None,
        }
    }

    /// Text of the `<t>` element of item `idx`; panics if the item has none.
    fn t_value(sst: &Sst, idx: usize) -> &str {
        sst.items[idx].t.as_ref().unwrap().value.as_str()
    }

    #[test]
    fn test_sst_new_is_empty() {
        let table = SharedStringTable::new();
        assert!(table.is_empty());
        assert_eq!(table.len(), 0);
    }

    #[test]
    fn test_sst_add_returns_index() {
        let mut table = SharedStringTable::new();
        for (expected, word) in ["hello", "world", "foo"].into_iter().enumerate() {
            assert_eq!(table.add(word), expected);
        }
        assert_eq!(table.len(), 3);
    }

    #[test]
    fn test_sst_add_deduplicates() {
        let mut table = SharedStringTable::new();
        assert_eq!(table.add("hello"), 0);
        assert_eq!(table.add("world"), 1);
        // A repeated string maps back to its first index.
        assert_eq!(table.add("hello"), 0);
        // Only the two distinct strings are stored.
        assert_eq!(table.len(), 2);
    }

    #[test]
    fn test_sst_add_owned() {
        let mut table = SharedStringTable::new();
        assert_eq!(table.add_owned("hello".to_string()), 0);
        assert_eq!(table.add_owned("world".to_string()), 1);
        // Owned inserts deduplicate as well.
        assert_eq!(table.add_owned("hello".to_string()), 0);
        assert_eq!(table.len(), 2);
        assert_eq!(table.get(0), Some("hello"));
        assert_eq!(table.get(1), Some("world"));
    }

    #[test]
    fn test_sst_get() {
        let mut table = SharedStringTable::new();
        table.add("alpha");
        table.add("beta");

        assert_eq!(table.get(0), Some("alpha"));
        assert_eq!(table.get(1), Some("beta"));
        assert_eq!(table.get(2), None);
    }

    #[test]
    fn test_sst_from_xml_and_back() {
        let xml_sst = sst_of(vec![plain_si("Name"), plain_si("Age"), plain_si("City")]);

        let table = SharedStringTable::from_sst(xml_sst);
        assert_eq!(table.len(), 3);
        assert_eq!(table.get(0), Some("Name"));
        assert_eq!(table.get(1), Some("Age"));
        assert_eq!(table.get(2), Some("City"));

        // And back to the XML representation.
        let back = table.to_sst();
        assert_eq!(back.items.len(), 3);
        assert_eq!(t_value(&back, 0), "Name");
        assert_eq!(t_value(&back, 1), "Age");
        assert_eq!(t_value(&back, 2), "City");
        assert_eq!(back.count, Some(3));
        assert_eq!(back.unique_count, Some(3));
    }

    #[test]
    fn test_sst_from_xml_rich_text() {
        let xml_sst = sst_of(vec![rich_si(&["Bold", " Normal"])]);

        let table = SharedStringTable::from_sst(xml_sst);
        assert_eq!(table.len(), 1);
        assert_eq!(table.get(0), Some("Bold Normal"));
    }

    #[test]
    fn test_sst_default() {
        let table = SharedStringTable::default();
        assert!(table.is_empty());
    }

    #[test]
    fn test_add_rich_text() {
        let mut table = SharedStringTable::new();
        let runs = vec![run("Hello ", true), run("World", false)];

        let idx = table.add_rich_text(&runs);
        assert_eq!(idx, 0);
        assert_eq!(table.get(0), Some("Hello World"));
        assert!(table.get_rich_text(0).is_some());
    }

    #[test]
    fn test_get_rich_text_none_for_plain() {
        let mut table = SharedStringTable::new();
        table.add("plain");
        assert!(table.get_rich_text(0).is_none());
    }

    #[test]
    fn test_rich_text_roundtrip_through_sst() {
        let xml_sst = sst_of(vec![rich_si(&["Bold", " Normal"])]);

        let back = SharedStringTable::from_sst(xml_sst).to_sst();
        assert!(back.items[0].t.is_none());
        assert_eq!(back.items[0].r.len(), 2);
    }

    #[test]
    fn test_space_preserve_roundtrip() {
        let xml_sst = sst_of(vec![preserved_si(" leading space")]);

        let back = SharedStringTable::from_sst(xml_sst).to_sst();
        assert_eq!(
            back.items[0].t.as_ref().unwrap().xml_space,
            Some("preserve".to_string())
        );
    }

    #[test]
    fn test_add_owned_then_to_sst() {
        let mut table = SharedStringTable::new();
        table.add_owned("test".to_string());

        let sst = table.to_sst();
        assert_eq!(sst.items.len(), 1);
        assert_eq!(t_value(&sst, 0), "test");
    }

    #[test]
    fn test_from_sst_zero_copy_plain_text() {
        let xml_sst = sst_of(vec![plain_si("Alpha"), plain_si("Beta"), plain_si("Gamma")]);

        let table = SharedStringTable::from_sst(xml_sst);
        assert_eq!(table.len(), 3);
        assert_eq!(table.get(0), Some("Alpha"));
        assert_eq!(table.get(1), Some("Beta"));
        assert_eq!(table.get(2), Some("Gamma"));

        let back = table.to_sst();
        assert_eq!(t_value(&back, 0), "Alpha");
        assert_eq!(t_value(&back, 1), "Beta");
        assert_eq!(t_value(&back, 2), "Gamma");
    }

    #[test]
    fn test_from_sst_mixed_plain_and_rich_text() {
        let xml_sst = sst_of(vec![
            plain_si("Plain"),
            rich_si(&["Rich", " Text"]),
            preserved_si(" spaced "),
        ]);

        let table = SharedStringTable::from_sst(xml_sst);
        assert_eq!(table.len(), 3);
        assert_eq!(table.get(0), Some("Plain"));
        assert_eq!(table.get(1), Some("Rich Text"));
        assert_eq!(table.get(2), Some(" spaced "));
        assert!(table.get_rich_text(0).is_none());
        assert!(table.get_rich_text(1).is_some());
    }

    #[test]
    fn test_from_sst_empty_items() {
        let table = SharedStringTable::from_sst(sst_of(vec![]));
        assert!(table.is_empty());
    }
}