diff --git a/Cargo.toml b/Cargo.toml index 47669c6..e133227 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -30,4 +30,5 @@ zip = "2.2.2" roxmltree = "0.20.0" thiserror = "2.0.11" base64 = "0.22.1" -image = "0.24.9" \ No newline at end of file +image = "0.24.9" +rayon = "1.10.0" \ No newline at end of file diff --git a/README.md b/README.md index c56c816..c45ddf4 100644 --- a/README.md +++ b/README.md @@ -32,7 +32,7 @@ fn main() -> Result<(), Box> { .extract_images(true) .build(); let pptx_container = PptxContainer::open(Path::new("presentation.pptx"), config)?; - let slides = container.parse_all()?; + let slides = container.parse_all()?; // or `parse_all_multi_threaded()?` for slide in slides { // Convert each slide into Markdown diff --git a/examples/performance_test.rs b/examples/performance_test.rs new file mode 100644 index 0000000..b33a06f --- /dev/null +++ b/examples/performance_test.rs @@ -0,0 +1,275 @@ +//! Performance benchmark example for the pptx-to-md crate +//! +//! This example measures and compares the performance of different parsing approaches: +//! - Single-threaded parsing +//! - Single-threaded parsing with streaming +//! - Optimized multithreaded parsing with Rayon +//! +//! It also provides timing for individual operations to identify bottlenecks. +//! +//! 
Run with: cargo run --release --example performance_test <pptx_path> [iterations] + +use pptx_to_md::{ParserConfig, PptxContainer, Result}; +use rayon::prelude::*; +use std::env; +use std::path::Path; +use std::time::{Duration, Instant}; + +struct Benchmark { + name: String, + start_time: Instant, + results: Vec<Duration>, +} + +impl Benchmark { + fn new(name: &str) -> Self { + println!("Starting benchmark: {}", name); + Benchmark { + name: name.to_string(), + start_time: Instant::now(), + results: Vec::new(), + } + } + + fn measure<F, T>(&mut self, mut f: F) -> T + where + F: FnMut() -> T, + { + let start = Instant::now(); + let result = f(); + let duration = start.elapsed(); + self.results.push(duration); + println!(" Operation took: {:?}", duration); + result + } + + fn report(&self) { + if self.results.is_empty() { + println!("No measurements for {}", self.name); + return; + } + + let total = self.start_time.elapsed(); + let count = self.results.len(); + let sum: Duration = self.results.iter().sum(); + let avg = sum / count as u32; + let min = self.results.iter().min().unwrap(); + let max = self.results.iter().max().unwrap(); + + println!("\nBenchmark Results for {}", self.name); + println!("----------------------------"); + println!("Total time: {:?}", total); + println!("Operations: {}", count); + println!("Average time per operation: {:?}", avg); + println!("Min time: {:?}", min); + println!("Max time: {:?}", max); + println!("----------------------------\n"); + } +} + +fn main() -> Result<()> { + // Get the PPTX file path and optional iteration count from command line arguments + let args: Vec<String> = env::args().collect(); + let pptx_path = if args.len() > 1 { + &args[1] + } else { + eprintln!("Usage: cargo run --example performance_test <pptx_path> [iterations]"); + return Ok(()); + }; + + let iterations = if args.len() > 2 { + args[2].parse::<usize>().ok().filter(|&n| n > 0).unwrap_or(10) // zero or unparsable input falls back to the default; zero would divide by zero below + } else { + 10 // Default to 10 iterations + }; + + println!("Performance testing with {} iterations on: {}", iterations, pptx_path); + + + + // 
=========== Single-threaded Approach =========== + let mut single_thread_bench = Benchmark::new("Single-threaded parsing"); + + let mut total_slides = 0; + + for i in 0..iterations { + println!("\nIteration {} (Single-threaded)", i + 1); + + // Measure container creation + let mut container = single_thread_bench.measure(|| { + let config = ParserConfig::builder() + .extract_images(true) + .build(); + PptxContainer::open(Path::new(pptx_path), config).expect("Failed to open PPTX") + }); + + println!(" Found {} slides in the presentation", container.slide_count); + + // Measure parsing + let slides = single_thread_bench.measure(|| { + container.parse_all().expect("Failed to parse slides") + }); + + // Measure conversion + let _md_content = single_thread_bench.measure(|| { + slides.iter() + .filter_map(|slide| slide.convert_to_md()) + .collect::<Vec<_>>() + }); + + total_slides += slides.len(); + } + + single_thread_bench.report(); + println!("Average slides per presentation: {}", total_slides / iterations); + + + + // =========== Single-threaded Streamed Approach =========== + let mut single_thread_streamed_bench = Benchmark::new("Single-threaded streamed parsing"); + + total_slides = 0; + + for i in 0..iterations { + println!("\nIteration {} (Single-threaded streamed)", i + 1); + + // Measure container creation + let mut container = single_thread_streamed_bench.measure(|| { + let config = ParserConfig::builder() + .extract_images(true) + .build(); + PptxContainer::open(Path::new(pptx_path), config).expect("Failed to open PPTX") + }); + + println!(" Found {} slides in the presentation", container.slide_count); + + // Record the slide count up front for the statistics output + let expected_slides = container.slide_count; + + // Measure slide processing (including parsing and conversion) + let slides_processed = single_thread_streamed_bench.measure(|| { + let mut processed = 0; + + // Process slides one by one using the iterator + for slide_result in container.iter_slides() { 
+ match slide_result { + Ok(slide) => { + // Convert the slide to Markdown + let _md_content = slide.convert_to_md(); + processed += 1; + }, + Err(e) => { + eprintln!("Error processing slide: {:?}", e); + } + } + } + + processed + }); + + println!(" Processed {} out of {} slides", slides_processed, expected_slides); + total_slides += slides_processed; + } + + single_thread_streamed_bench.report(); + println!("Average slides per presentation: {}", total_slides / iterations); + + + + // =========== Optimized Multi-threaded Approach =========== + let mut optimized_multi_thread_bench = Benchmark::new("Optimized Multi-threaded parsing"); + + total_slides = 0; + + for i in 0..iterations { + println!("\nIteration {} (Optimized Multi-threaded)", i + 1); + + // Open the container with the desired configuration + let mut container = optimized_multi_thread_bench.measure(|| { + let config = ParserConfig::builder() + .extract_images(true) + .build(); + PptxContainer::open(Path::new(pptx_path), config).expect("Failed to open PPTX") + }); + + println!(" Found {} slides in the presentation", container.slide_count); + + // Use the new optimized multi-threading method + let slides = optimized_multi_thread_bench.measure(|| { + container.parse_all_multi_threaded().expect("Failed to parse slides") + }); + + println!(" Successfully processed {} slides", slides.len()); + + // Convert to Markdown in parallel (unchanged) + let _md_content = optimized_multi_thread_bench.measure(|| { + slides.par_iter() + .filter_map(|slide| slide.convert_to_md()) + .collect::<Vec<_>>() + }); + + total_slides += slides.len(); + } + + optimized_multi_thread_bench.report(); + println!("Average slides per presentation: {}", total_slides / iterations); + + // =========== Performance Comparison =========== + if !single_thread_bench.results.is_empty() && + !single_thread_streamed_bench.results.is_empty() && + !optimized_multi_thread_bench.results.is_empty() { + + let single_avg: Duration = 
single_thread_bench.results.iter().sum::<Duration>() / + iterations as u32; // time per full iteration: the benches record different numbers of operations per iteration, so per-operation averages are not comparable + let single_streamed_avg: Duration = single_thread_streamed_bench.results.iter().sum::<Duration>() / + iterations as u32; + let optimized_multi_avg: Duration = optimized_multi_thread_bench.results.iter().sum::<Duration>() / + iterations as u32; + + println!("\nPerformance Comparison"); + println!("====================="); + println!("Single-threaded average: {:?}", single_avg); + println!("Single-threaded streaming average: {:?}", single_streamed_avg); + println!("Optimized multi-threaded average: {:?}", optimized_multi_avg); + + // Compare single-threaded vs single-threaded streaming + if single_avg > single_streamed_avg { + let speedup = single_avg.as_secs_f64() / single_streamed_avg.as_secs_f64(); + println!("Single-threaded streaming is {:.2}x faster than single-threaded", speedup); + } else { + let slowdown = single_streamed_avg.as_secs_f64() / single_avg.as_secs_f64(); + println!("Single-threaded streaming is {:.2}x slower than single-threaded", slowdown); + } + + // Compare single-threaded vs optimized multithreaded + if single_avg > optimized_multi_avg { + let speedup = single_avg.as_secs_f64() / optimized_multi_avg.as_secs_f64(); + println!("Optimized multi-threaded is {:.2}x faster than single-threaded", speedup); + } else { + let slowdown = optimized_multi_avg.as_secs_f64() / single_avg.as_secs_f64(); + println!("Optimized multi-threaded is {:.2}x slower than single-threaded", slowdown); + } + + // Compare single-threaded streaming vs optimized multithreaded + if single_streamed_avg > optimized_multi_avg { + let speedup = single_streamed_avg.as_secs_f64() / optimized_multi_avg.as_secs_f64(); + println!("Optimized multi-threaded is {:.2}x faster than single-threaded streaming", speedup); + } else { + let slowdown = optimized_multi_avg.as_secs_f64() / single_streamed_avg.as_secs_f64(); + println!("Optimized multi-threaded is 
{:.2}x slower than single-threaded streaming", slowdown); + } + + // Determine the overall fastest approach + let fastest_approach = if single_avg <= single_streamed_avg && single_avg <= optimized_multi_avg { + "Single-threaded" + } else if single_streamed_avg <= single_avg && single_streamed_avg <= optimized_multi_avg { + "Single-threaded streaming" + } else { + "Optimized multi-threaded" + }; + + println!("\nOverall result: {} approach is the fastest for this workload.", fastest_approach); + } + + Ok(()) +} \ No newline at end of file diff --git a/src/container.rs b/src/container.rs index 7c74c67..9243d1d 100644 --- a/src/container.rs +++ b/src/container.rs @@ -1,10 +1,12 @@ use super::{Result, Slide}; +use crate::parser_config::ParserConfig; +use rayon::prelude::*; use std::{ collections::HashMap, io::Read, path::Path, }; -use crate::parser_config::ParserConfig; +use std::sync::Arc; /// Holds the internal representation of a loaded PowerPoint (pptx) container. /// @@ -12,9 +14,10 @@ use crate::parser_config::ParserConfig; /// directly from a loaded pptx file. It parses and stores XML slides content, /// relationships (`rels`) files, and associated resources such as images. pub struct PptxContainer { - config: ParserConfig, + pub config: ParserConfig, archive: zip::ZipArchive, - slide_paths: Vec<String>, + pub slide_paths: Vec<String>, + pub slide_count: u32, } impl PptxContainer { @@ -41,6 +44,7 @@ impl PptxContainer { let mut archive = zip::ZipArchive::new(file)?; let mut slide_paths: Vec<String> = Vec::new(); + let mut slide_count = 0; for i in 0..archive.len() { let file = archive.by_index(i)?; @@ -48,12 +52,13 @@ impl PptxContainer { if name.starts_with("ppt/slides/slide") && name.ends_with(".xml") { slide_paths.push(name); + slide_count += 1; } } slide_paths.sort(); - Ok(Self { archive, slide_paths, config }) + Ok(Self { archive, slide_paths, config, slide_count }) } /// Parses the data of all slides for each path present in the containers' `slide_path` vector. 
@@ -74,6 +79,84 @@ impl PptxContainer { Ok(slides) } + /// Parses all slides in the presentation with optimized multithreaded processing. + /// + /// This method uses Rayon for parallel processing by: + /// 1. Preloading all necessary data sequentially (I/O-bound operations) + /// 2. Performing CPU-intensive XML parsing in parallel + /// 3. Using shared references for thread-safe data access + /// + /// # Returns + /// + /// * `Result<Vec<Slide>>` - List of all parsed slides + pub fn parse_all_multi_threaded(&mut self) -> Result<Vec<Slide>> { + // Clone paths upfront to avoid holding reference to self + let slide_paths = self.slide_paths.clone(); + let config = self.config.clone(); + let mut raw_data = Vec::with_capacity(slide_paths.len()); + let mut all_image_data = HashMap::new(); + + for slide_path in &slide_paths { + // Read slide XML and relationships + let slide_xml = self.read_file_from_archive(slide_path)?; + let rels_path = self.get_slide_rels_path(slide_path); + let rels_data = self.read_file_from_archive(&rels_path).ok(); + let slide_number = Slide::extract_slide_number(slide_path).unwrap_or(0); + + // Preload images if enabled + let mut slide_images = Vec::new(); + if config.extract_images { + if let Some(ref data) = rels_data { + slide_images = crate::parse_rels::parse_slide_rels(data)?; + } + + for img_ref in &slide_images { + if all_image_data.contains_key(&img_ref.target) { continue; } // an image shared by several slides is read from the archive only once + let path = PptxContainer::get_full_image_path(slide_path, &img_ref.target); + all_image_data.insert(img_ref.target.clone(), self.read_file_from_archive(&path)?); + } + } + + raw_data.push((slide_path.clone(), slide_number, slide_xml, slide_images)); + } + + // Wrap the preloaded image data in an `Arc` so the parallel closure below can read it + let shared_image_data = Arc::new(all_image_data); + + // Parallel processing starts here (CPU-bound tasks) + let slides: Result<Vec<Slide>> = raw_data + .into_par_iter() + .map(|(path, number, xml, images)| { + // Parse XML in parallel (CPU-intensive) + let elements = crate::parse_xml::parse_slide_xml(&xml)?; + + // Resolve image 
data from shared registry + let mut image_map = HashMap::new(); + if config.extract_images { + for img_ref in &images { + if let Some(data) = shared_image_data.get(&img_ref.target) { + image_map.insert(img_ref.id.clone(), data.clone()); + } + } + } + + // Build slide + let mut slide = Slide::new( + path, + number, + elements, + images, + image_map, + config.clone(), + ); + slide.link_images(); + Ok(slide) + }) + .collect(); + + slides + } + pub fn iter_slides(&mut self) -> SlideIterator { SlideIterator::new(self) @@ -99,7 +182,7 @@ impl PptxContainer { /// // println!("Loaded first slide: {}", slide.slide_number); /// // } /// ``` - fn load_slide(&mut self, slide_path: &str) -> Result> { + pub fn load_slide(&mut self, slide_path: &str) -> Result> { // load xml data let slide_data = self.read_file_from_archive(slide_path)?; @@ -158,7 +241,7 @@ impl PptxContainer { /// /// This is an internal method used to extract individual files from the /// PPTX archive (which is essentially a ZIP file). 
- fn read_file_from_archive(&mut self, path: &str) -> Result<Vec<u8>> { + pub fn read_file_from_archive(&mut self, path: &str) -> Result<Vec<u8>> { let mut file = self.archive.by_name(path)?; let mut content = Vec::new(); file.read_to_end(&mut content)?; @@ -180,7 +263,7 @@ impl PptxContainer { /// ``` /// // For a slide path "ppt/slides/slide1.xml" /// // Returns "ppt/slides/_rels/slide1.xml.rels" - fn get_slide_rels_path(&self, slide_path: &str) -> String { + pub fn get_slide_rels_path(&self, slide_path: &str) -> String { let mut rels_path = slide_path.to_string(); if let Some(pos) = rels_path.rfind('/') { rels_path.insert_str(pos + 1, "_rels/"); @@ -189,7 +272,7 @@ impl PptxContainer { rels_path } - fn get_full_image_path(slide_path: &str, target: &str) -> String { + pub fn get_full_image_path(slide_path: &str, target: &str) -> String { if target.starts_with("../") { let adjusted_target = target.trim_start_matches("../"); format!("ppt/{}", adjusted_target) diff --git a/src/lib.rs b/src/lib.rs index 4e9e88f..80293e4 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -2,8 +2,8 @@ mod container; mod slide; mod types; mod constants; -mod parse_xml; -mod parse_rels; +pub mod parse_xml; +pub mod parse_rels; mod parser_config; pub use container::PptxContainer; @@ -40,6 +40,9 @@ pub enum Error { #[error("Conversion was not possible")] ConversionFailed, + #[error("Multi-threaded conversion was not possible")] + MultiThreadedConversionFailed, + #[error("Unbekannter Fehler")] Unknown, }