From 929b198331a2222528733e52236b4a5204a60aa7 Mon Sep 17 00:00:00 2001 From: nilsk Date: Sun, 15 Jun 2025 01:15:44 +0200 Subject: [PATCH 1/2] added a multithreaded version of `parse_all` and a benchmark module in the examples directory --- Cargo.toml | 3 +- README.md | 2 +- examples/performance_test.rs | 275 +++++++++++++++++++++++++++++++++++ src/container.rs | 103 ++++++++++++- src/lib.rs | 7 +- 5 files changed, 378 insertions(+), 12 deletions(-) create mode 100644 examples/performance_test.rs diff --git a/Cargo.toml b/Cargo.toml index 47669c6..e133227 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -30,4 +30,5 @@ zip = "2.2.2" roxmltree = "0.20.0" thiserror = "2.0.11" base64 = "0.22.1" -image = "0.24.9" \ No newline at end of file +image = "0.24.9" +rayon = "1.10.0" \ No newline at end of file diff --git a/README.md b/README.md index c56c816..c45ddf4 100644 --- a/README.md +++ b/README.md @@ -32,7 +32,7 @@ fn main() -> Result<(), Box> { .extract_images(true) .build(); let pptx_container = PptxContainer::open(Path::new("presentation.pptx"), config)?; - let slides = container.parse_all()?; + let slides = container.parse_all()?; // or `parse_all_multi_threaded()?` for slide in slides { // Convert each slide into Markdown diff --git a/examples/performance_test.rs b/examples/performance_test.rs new file mode 100644 index 0000000..b33a06f --- /dev/null +++ b/examples/performance_test.rs @@ -0,0 +1,275 @@ +//! Performance benchmark example for the pptx-to-md crate +//! +//! This example measures and compares the performance of different parsing approaches: +//! - Single-threaded parsing +//! - Single-threaded parsing with streaming +//! - Optimized multithreaded parsing with Rayon +//! +//! It also provides timing for individual operations to identify bottlenecks. +//! +//! 
Run with: cargo run --release --example performance_test [iterations] + +use pptx_to_md::{ParserConfig, PptxContainer, Result}; +use rayon::prelude::*; +use std::env; +use std::path::Path; +use std::time::{Duration, Instant}; + +struct Benchmark { + name: String, + start_time: Instant, + results: Vec, +} + +impl Benchmark { + fn new(name: &str) -> Self { + println!("Starting benchmark: {}", name); + Benchmark { + name: name.to_string(), + start_time: Instant::now(), + results: Vec::new(), + } + } + + fn measure(&mut self, mut f: F) -> T + where + F: FnMut() -> T, + { + let start = Instant::now(); + let result = f(); + let duration = start.elapsed(); + self.results.push(duration); + println!(" Operation took: {:?}", duration); + result + } + + fn report(&self) { + if self.results.is_empty() { + println!("No measurements for {}", self.name); + return; + } + + let total = self.start_time.elapsed(); + let count = self.results.len(); + let sum: Duration = self.results.iter().sum(); + let avg = sum / count as u32; + let min = self.results.iter().min().unwrap(); + let max = self.results.iter().max().unwrap(); + + println!("\nBenchmark Results for {}", self.name); + println!("----------------------------"); + println!("Total time: {:?}", total); + println!("Operations: {}", count); + println!("Average time per operation: {:?}", avg); + println!("Min time: {:?}", min); + println!("Max time: {:?}", max); + println!("----------------------------\n"); + } +} + +fn main() -> Result<()> { + // Get the PPTX file path and optional iteration count from command line arguments + let args: Vec = env::args().collect(); + let pptx_path = if args.len() > 1 { + &args[1] + } else { + eprintln!("Usage: cargo run --example performance_test [iterations]"); + return Ok(()); + }; + + let iterations = if args.len() > 2 { + args[2].parse().unwrap_or(5) + } else { + 10 // Default to 10 iterations + }; + + println!("Performance testing with {} iterations on: {}", iterations, pptx_path); + + + + // 
=========== Single-threaded Approach =========== + let mut single_thread_bench = Benchmark::new("Single-threaded parsing"); + + let mut total_slides = 0; + + for i in 0..iterations { + println!("\nIteration {} (Single-threaded)", i + 1); + + // Measure container creation + let mut container = single_thread_bench.measure(|| { + let config = ParserConfig::builder() + .extract_images(true) + .build(); + PptxContainer::open(Path::new(pptx_path), config).expect("Failed to open PPTX") + }); + + println!(" Found {} slides in the presentation", container.slide_count); + + // Measure parsing + let slides = single_thread_bench.measure(|| { + container.parse_all().expect("Failed to parse slides") + }); + + // Measure conversion + let _md_content = single_thread_bench.measure(|| { + slides.iter() + .filter_map(|slide| slide.convert_to_md()) + .collect::>() + }); + + total_slides += slides.len(); + } + + single_thread_bench.report(); + println!("Average slides per presentation: {}", total_slides / iterations); + + + + // =========== Single-threaded Streamed Approach =========== + let mut single_thread_streamed_bench = Benchmark::new("Single-threaded streamed parsing"); + + total_slides = 0; + + for i in 0..iterations { + println!("\nIteration {} (Single-threaded streamed)", i + 1); + + // Measure container creation + let mut container = single_thread_streamed_bench.measure(|| { + let config = ParserConfig::builder() + .extract_images(true) + .build(); + PptxContainer::open(Path::new(pptx_path), config).expect("Failed to open PPTX") + }); + + println!(" Found {} slides in the presentation", container.slide_count); + + // Zähle die Slides im Voraus für die statistische Auswertung + let expected_slides = container.slide_count; + + // Measure slide processing (including parsing and conversion) + let slides_processed = single_thread_streamed_bench.measure(|| { + let mut processed = 0; + + // Process slides one by one using the iterator + for slide_result in container.iter_slides() { 
+ match slide_result { + Ok(slide) => { + // Konvertiere den Slide zu Markdown + let _md_content = slide.convert_to_md(); + processed += 1; + }, + Err(e) => { + eprintln!("Error processing slide: {:?}", e); + } + } + } + + processed + }); + + println!(" Processed {} out of {} slides", slides_processed, expected_slides); + total_slides += slides_processed; + } + + single_thread_streamed_bench.report(); + println!("Average slides per presentation: {}", total_slides / iterations); + + + + // =========== Optimized Multi-threaded Approach =========== + let mut optimized_multi_thread_bench = Benchmark::new("Optimized Multi-threaded parsing"); + + total_slides = 0; + + for i in 0..iterations { + println!("\nIteration {} (Optimized Multi-threaded)", i + 1); + + // Container öffnen mit der gewünschten Konfiguration + let mut container = optimized_multi_thread_bench.measure(|| { + let config = ParserConfig::builder() + .extract_images(true) + .build(); + PptxContainer::open(Path::new(pptx_path), config).expect("Failed to open PPTX") + }); + + println!(" Found {} slides in the presentation", container.slide_count); + + // Verwende die neue optimierte Multi-Threading-Methode + let slides = optimized_multi_thread_bench.measure(|| { + container.parse_all_multi_threaded().expect("Failed to parse slides") + }); + + println!(" Successfully processed {} slides", slides.len()); + + // Parallel zu Markdown konvertieren (bleibt unverändert) + let _md_content = optimized_multi_thread_bench.measure(|| { + slides.par_iter() + .filter_map(|slide| slide.convert_to_md()) + .collect::>() + }); + + total_slides += slides.len(); + } + + optimized_multi_thread_bench.report(); + println!("Average slides per presentation: {}", total_slides / iterations); + + // =========== Performance Comparison =========== + if !single_thread_bench.results.is_empty() && + !single_thread_streamed_bench.results.is_empty() && + !optimized_multi_thread_bench.results.is_empty() { + + let single_avg: Duration = 
single_thread_bench.results.iter().sum::() / + single_thread_bench.results.len() as u32; + let single_streamed_avg: Duration = single_thread_streamed_bench.results.iter().sum::() / + single_thread_streamed_bench.results.len() as u32; + let optimized_multi_avg: Duration = optimized_multi_thread_bench.results.iter().sum::() / + optimized_multi_thread_bench.results.len() as u32; + + println!("\nPerformance Comparison"); + println!("====================="); + println!("Single-threaded average: {:?}", single_avg); + println!("Single-threaded streaming average: {:?}", single_streamed_avg); + println!("Optimized multi-threaded average: {:?}", optimized_multi_avg); + + // Compare single-threaded vs single-threaded streaming + if single_avg > single_streamed_avg { + let speedup = single_avg.as_secs_f64() / single_streamed_avg.as_secs_f64(); + println!("Single-threaded streaming is {:.2}x faster than single-threaded", speedup); + } else { + let slowdown = single_streamed_avg.as_secs_f64() / single_avg.as_secs_f64(); + println!("Single-threaded streaming is {:.2}x slower than single-threaded", slowdown); + } + + // Compare single-threaded vs optimized multithreaded + if single_avg > optimized_multi_avg { + let speedup = single_avg.as_secs_f64() / optimized_multi_avg.as_secs_f64(); + println!("Optimized multi-threaded is {:.2}x faster than single-threaded", speedup); + } else { + let slowdown = optimized_multi_avg.as_secs_f64() / single_avg.as_secs_f64(); + println!("Optimized multi-threaded is {:.2}x slower than single-threaded", slowdown); + } + + // Compare single-threaded streaming vs optimized multithreaded + if single_streamed_avg > optimized_multi_avg { + let speedup = single_streamed_avg.as_secs_f64() / optimized_multi_avg.as_secs_f64(); + println!("Optimized multi-threaded is {:.2}x faster than single-threaded streaming", speedup); + } else { + let slowdown = optimized_multi_avg.as_secs_f64() / single_streamed_avg.as_secs_f64(); + println!("Optimized multi-threaded is 
{:.2}x slower than single-threaded streaming", slowdown); + } + + // Determine the overall fastest approach + let fastest_approach = if single_avg <= single_streamed_avg && single_avg <= optimized_multi_avg { + "Single-threaded" + } else if single_streamed_avg <= single_avg && single_streamed_avg <= optimized_multi_avg { + "Single-threaded streaming" + } else { + "Optimized multi-threaded" + }; + + println!("\nOverall result: {} approach is the fastest for this workload.", fastest_approach); + } + + Ok(()) +} \ No newline at end of file diff --git a/src/container.rs b/src/container.rs index 7c74c67..b6c3987 100644 --- a/src/container.rs +++ b/src/container.rs @@ -1,10 +1,11 @@ use super::{Result, Slide}; +use crate::parser_config::ParserConfig; +use rayon::prelude::*; use std::{ collections::HashMap, io::Read, path::Path, }; -use crate::parser_config::ParserConfig; /// Holds the internal representation of a loaded PowerPoint (pptx) container. /// @@ -12,9 +13,10 @@ use crate::parser_config::ParserConfig; /// directly from a loaded pptx file. It parses and stores XML slides content, /// relationships (`rels`) files, and associated resources such as images. pub struct PptxContainer { - config: ParserConfig, + pub config: ParserConfig, archive: zip::ZipArchive, - slide_paths: Vec, + pub slide_paths: Vec, + pub slide_count: u32, } impl PptxContainer { @@ -41,6 +43,7 @@ impl PptxContainer { let mut archive = zip::ZipArchive::new(file)?; let mut slide_paths: Vec = Vec::new(); + let mut slide_count = 0; for i in 0..archive.len() { let file = archive.by_index(i)?; @@ -48,12 +51,13 @@ impl PptxContainer { if name.starts_with("ppt/slides/slide") && name.ends_with(".xml") { slide_paths.push(name); + slide_count += 1; } } slide_paths.sort(); - Ok(Self { archive, slide_paths, config }) + Ok(Self { archive, slide_paths, config, slide_count }) } /// Parses the data of all slides for each path present in the containers' `slide_path` vector. 
@@ -74,6 +78,89 @@ impl PptxContainer { Ok(slides) } + /// Parses all slides in the presentation with optimised multithreaded processing. + /// + /// This method uses Rayon for parallel processing by: + /// 1. preloading all necessary data + /// 2. performing CPU-intensive processing in parallel + /// 3. ensuring memory efficiency and thread safety + /// + /// # Returns + /// + /// * `Result>` - List of all parsed slides + pub fn parse_all_multi_threaded(&mut self) -> Result> { + // Zuerst extrahieren wir alle Daten, die wir brauchen, im Voraus + let slide_paths: Vec = self.slide_paths.to_vec(); + let config = self.config.clone(); + + // Vorausladen aller benötigten Daten, einschließlich Bilddaten + let mut complete_slide_data = Vec::with_capacity(self.slide_count as usize); + + for slide_path in &slide_paths { + // Slide XML-Daten laden + let slide_xml = self.read_file_from_archive(slide_path)?; + + // Beziehungsdatei für den Slide laden + let rels_path = self.get_slide_rels_path(slide_path); + let rels_data = self.read_file_from_archive(&rels_path).ok(); + + // Slide-Nummer extrahieren + let slide_number = Slide::extract_slide_number(slide_path).unwrap_or(0); + + // Parse Slide-Elemente + let elements = crate::parse_xml::parse_slide_xml(&slide_xml)?; + + // Bilder extrahieren + let mut images = Vec::new(); + let mut image_data = HashMap::new(); + + if self.config.extract_images { + // Bilder aus Beziehungen extrahieren + if let Some(ref rels_bytes) = rels_data { + images = crate::parse_rels::parse_slide_rels(rels_bytes)?; + } + + // Alle Bilddaten vorab laden + for img_ref in &images { + let img_path = PptxContainer::get_full_image_path(slide_path, &img_ref.target); + if let Ok(data) = self.read_file_from_archive(&img_path) { + image_data.insert(img_ref.id.clone(), data); + } + } + } + + // Alle vorbereiteten Daten in eine Struktur packen + complete_slide_data.push(( + slide_path.clone(), + slide_number, + elements, + images, + image_data + )); + } + + // Jetzt 
können wir parallel verarbeiten, ohne Zugriff auf self + let slides: Vec = complete_slide_data.into_par_iter() // Beachte: into_par_iter, um Ownership zu übertragen + .map(|(slide_path, slide_number, elements, images, image_data)| { + // Slide erstellen (direkt, ohne Container-Zugriff) + let mut slide = Slide::new( + slide_path, + slide_number, + elements, + images, + image_data, + config.clone(), // Klonen der (vermutlich kleinen) Config + ); + + // Bilder verknüpfen + slide.link_images(); + slide + }) + .collect(); + + Ok(slides) + } + pub fn iter_slides(&mut self) -> SlideIterator { SlideIterator::new(self) @@ -99,7 +186,7 @@ impl PptxContainer { /// // println!("Loaded first slide: {}", slide.slide_number); /// // } /// ``` - fn load_slide(&mut self, slide_path: &str) -> Result> { + pub fn load_slide(&mut self, slide_path: &str) -> Result> { // load xml data let slide_data = self.read_file_from_archive(slide_path)?; @@ -158,7 +245,7 @@ impl PptxContainer { /// /// This is an internal method used to extract individual files from the /// PPTX archive (which is essentially a ZIP file). 
- fn read_file_from_archive(&mut self, path: &str) -> Result> { + pub fn read_file_from_archive(&mut self, path: &str) -> Result> { let mut file = self.archive.by_name(path)?; let mut content = Vec::new(); file.read_to_end(&mut content)?; @@ -180,7 +267,7 @@ impl PptxContainer { /// ``` /// // For a slide path "ppt/slides/slide1.xml" /// // Returns "ppt/slides/_rels/slide1.xml.rels" - fn get_slide_rels_path(&self, slide_path: &str) -> String { + pub fn get_slide_rels_path(&self, slide_path: &str) -> String { let mut rels_path = slide_path.to_string(); if let Some(pos) = rels_path.rfind('/') { rels_path.insert_str(pos + 1, "_rels/"); @@ -189,7 +276,7 @@ impl PptxContainer { rels_path } - fn get_full_image_path(slide_path: &str, target: &str) -> String { + pub fn get_full_image_path(slide_path: &str, target: &str) -> String { if target.starts_with("../") { let adjusted_target = target.trim_start_matches("../"); format!("ppt/{}", adjusted_target) diff --git a/src/lib.rs b/src/lib.rs index 4e9e88f..80293e4 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -2,8 +2,8 @@ mod container; mod slide; mod types; mod constants; -mod parse_xml; -mod parse_rels; +pub mod parse_xml; +pub mod parse_rels; mod parser_config; pub use container::PptxContainer; @@ -40,6 +40,9 @@ pub enum Error { #[error("Conversion was not possible")] ConversionFailed, + #[error("Conversion was not possible")] + MultiThreadedConversionFailed, + #[error("Unbekannter Fehler")] Unknown, } From 4007750d0d499355c487004dab500a81d8260948 Mon Sep 17 00:00:00 2001 From: nilsk Date: Sun, 15 Jun 2025 02:10:08 +0200 Subject: [PATCH 2/2] improved overall performance of multithreaded approach --- src/container.rs | 98 +++++++++++++++++++++++------------------------- 1 file changed, 47 insertions(+), 51 deletions(-) diff --git a/src/container.rs b/src/container.rs index b6c3987..9243d1d 100644 --- a/src/container.rs +++ b/src/container.rs @@ -6,6 +6,7 @@ use std::{ io::Read, path::Path, }; +use std::sync::Arc; /// Holds the 
internal representation of a loaded PowerPoint (pptx) container. /// @@ -78,87 +79,82 @@ impl PptxContainer { Ok(slides) } - /// Parses all slides in the presentation with optimised multithreaded processing. + /// Parses all slides in the presentation with optimized multithreaded processing. /// /// This method uses Rayon for parallel processing by: - /// 1. preloading all necessary data - /// 2. performing CPU-intensive processing in parallel - /// 3. ensuring memory efficiency and thread safety + /// 1. Preloading all necessary data sequentially (I/O-bound operations) + /// 2. Performing CPU-intensive XML parsing in parallel + /// 3. Using shared references for thread-safe data access /// /// # Returns /// /// * `Result>` - List of all parsed slides pub fn parse_all_multi_threaded(&mut self) -> Result> { - // Zuerst extrahieren wir alle Daten, die wir brauchen, im Voraus - let slide_paths: Vec = self.slide_paths.to_vec(); + // Clone paths upfront to avoid holding reference to self + let slide_paths = self.slide_paths.clone(); let config = self.config.clone(); - - // Vorausladen aller benötigten Daten, einschließlich Bilddaten - let mut complete_slide_data = Vec::with_capacity(self.slide_count as usize); + let mut raw_data = Vec::with_capacity(slide_paths.len()); + let mut all_image_data = HashMap::new(); for slide_path in &slide_paths { - // Slide XML-Daten laden + // Read slide XML and relationships let slide_xml = self.read_file_from_archive(slide_path)?; - - // Beziehungsdatei für den Slide laden let rels_path = self.get_slide_rels_path(slide_path); let rels_data = self.read_file_from_archive(&rels_path).ok(); - - // Slide-Nummer extrahieren let slide_number = Slide::extract_slide_number(slide_path).unwrap_or(0); - // Parse Slide-Elemente - let elements = crate::parse_xml::parse_slide_xml(&slide_xml)?; - - // Bilder extrahieren - let mut images = Vec::new(); - let mut image_data = HashMap::new(); - - if self.config.extract_images { - // Bilder aus Beziehungen 
extrahieren - if let Some(ref rels_bytes) = rels_data { - images = crate::parse_rels::parse_slide_rels(rels_bytes)?; + // Preload images if enabled + let mut slide_images = Vec::new(); + if config.extract_images { + if let Some(ref data) = rels_data { + slide_images = crate::parse_rels::parse_slide_rels(data)?; } - // Alle Bilddaten vorab laden - for img_ref in &images { - let img_path = PptxContainer::get_full_image_path(slide_path, &img_ref.target); - if let Ok(data) = self.read_file_from_archive(&img_path) { - image_data.insert(img_ref.id.clone(), data); - } + for img_ref in &slide_images { + let path = PptxContainer::get_full_image_path(slide_path, &img_ref.target); + let data = self.read_file_from_archive(&path)?; + all_image_data.entry(img_ref.target.clone()).or_insert(data); } } - // Alle vorbereiteten Daten in eine Struktur packen - complete_slide_data.push(( - slide_path.clone(), - slide_number, - elements, - images, - image_data - )); + raw_data.push((slide_path.clone(), slide_number, slide_xml, slide_images)); } - // Jetzt können wir parallel verarbeiten, ohne Zugriff auf self - let slides: Vec = complete_slide_data.into_par_iter() // Beachte: into_par_iter, um Ownership zu übertragen - .map(|(slide_path, slide_number, elements, images, image_data)| { - // Slide erstellen (direkt, ohne Container-Zugriff) + // Share image data atomically across threads + let shared_image_data = Arc::new(all_image_data); + + // Parallel processing starts here (CPU-bound tasks) + let slides: Result> = raw_data + .into_par_iter() + .map(|(path, number, xml, images)| { + // Parse XML in parallel (CPU-intensive) + let elements = crate::parse_xml::parse_slide_xml(&xml)?; + + // Resolve image data from shared registry + let mut image_map = HashMap::new(); + if config.extract_images { + for img_ref in &images { + if let Some(data) = shared_image_data.get(&img_ref.target) { + image_map.insert(img_ref.id.clone(), data.clone()); + } + } + } + + // Build slide let mut slide = 
Slide::new( - slide_path, - slide_number, + path, + number, elements, images, - image_data, - config.clone(), // Klonen der (vermutlich kleinen) Config + image_map, + config.clone(), ); - - // Bilder verknüpfen slide.link_images(); - slide + Ok(slide) }) .collect(); - Ok(slides) + slides }