From 0d7f6a19994c45464db57db30a20ace2c6752d23 Mon Sep 17 00:00:00 2001 From: nilsk Date: Mon, 16 Jun 2025 14:31:24 +0200 Subject: [PATCH 1/3] added `ImageHandlingMode` to the `ParserConfig` --- README.md | 20 ++++++++++++++------ src/parser_config.rs | 39 +++++++++++++++++++++++++++++++++------ 2 files changed, 47 insertions(+), 12 deletions(-) diff --git a/README.md b/README.md index 181c2e6..b916ffc 100644 --- a/README.md +++ b/README.md @@ -62,12 +62,20 @@ fn main() -> Result<(), Box> { ## Config Parameters -| Parameter | Type | Default | Description | -|-----------|--------|-------|------------------------------------------------------------------------------------------------------------------------------------------------------------| -| `extract_images` | `bool` | `true` | Whether images are extracted from slides or not | -| `compress_images` | `bool` | `true` | Whether images are compressed before encoding or not | -| `image_quality` | `u8` | `80` | Defines the image compression quality `(0-100)`. Higher values mean better quality but larger file sizes. | - +| Parameter | Type | Default | Description | +|--------------------------|-----------------------|---------------|-----------------------------------------------------------------------------------------------------------| +| `extract_images` | `bool` | `true` | Whether images are extracted from slides or not | +| `compress_images` | `bool` | `true` | Whether images are compressed before encoding or not | +| `image_quality` | `u8` | `80` | Defines the image compression quality `(0-100)`. Higher values mean better quality but larger file sizes. | +| `image_handling_mode` | `ImageHandlingMode` | `InMarkdown` | Determines how images are handled during content export | +
+ +#### Member of `ImageHandlingMode` +| Member | Description | +|-----------------------|-------------------------------------------------------------------------------------------------------| +| `InMarkdown` | Images are embedded directly in the Markdown output using standard syntax as `base64` data (`![]()`) | +| `ManuallyMarkdown` | Image handling is delegated to the user, requiring manual copying or referencing (as `base64`) | +| `ManuallyRaw` | Image handling is delegated to the user, requiring manual copying or referencing (as raw `binary`) | --- ## 🏗 Project Structure diff --git a/src/parser_config.rs b/src/parser_config.rs index 53eef57..7398af9 100644 --- a/src/parser_config.rs +++ b/src/parser_config.rs @@ -1,15 +1,32 @@ -/// Configuration options for the PPTX parser. +/// Determines how images are handled during content export. +/// +/// # Members +/// +/// | Member | Description | +/// |-----------------------|-------------------------------------------------------------------------------------------------------| +/// | `InMarkdown` | Images are embedded directly in the Markdown output using standard syntax as `base64` data (`![]()`) | +/// | `ManuallyMarkdown` | Image handling is delegated to the user, requiring manual copying or referencing (as `base64`) | +/// | `ManuallyRaw` | Image handling is delegated to the user, requiring manual copying or referencing (as raw `binary`) | +#[derive(Debug, Clone)] +pub enum ImageHandlingMode { + InMarkdown, + ManuallyMarkdown, + ManuallyRaw, +} + +/// Configuration options for the PPTX parser. /// /// Use [`ParserConfig::builder()`] to create a configuration instance. /// This allows you to customize only the desired fields while falling back to sensible defaults for the rest. 
/// /// # Configuration Options /// -/// | Parameter | Type | Default | Description | -/// |-----------|------|---------|-------------| -/// | `extract_images` | `bool` | `true` | Whether images are extracted from slides or not | -/// | `compress_images` | `bool` | `true` | Whether images are compressed before encoding or not | -/// | `image_quality` | `u8` | `80` | Compression level (0-100);
higher values retain more detail but increase file size | +/// | Parameter | Type | Default | Description | +/// |---------------------------|-----------------------|---------------|-------------------------------------------------------------------------------------------| +/// | `extract_images` | `bool` | `true` | Whether images are extracted from slides or not | +/// | `compress_images` | `bool` | `true` | Whether images are compressed before encoding or not | +/// | `image_quality` | `u8` | `80` | Compression level (0-100);
higher values retain more detail but increase file size | +/// | `image_handling_mode` | `ImageHandlingMode` | `InMarkdown` | Determines how images are handled during content export. | /// /// # Example /// @@ -25,6 +42,7 @@ pub struct ParserConfig { pub extract_images: bool, pub compress_images: bool, pub quality: u8, + pub image_handling_mode: ImageHandlingMode, } impl Default for ParserConfig { @@ -33,6 +51,7 @@ impl Default for ParserConfig { extract_images: true, compress_images: true, quality: 80, + image_handling_mode: ImageHandlingMode::InMarkdown, } } } @@ -51,6 +70,7 @@ pub struct ParserConfigBuilder { extract_images: Option, compress_images: Option, image_quality: Option, + image_handling_mode: Option, } impl ParserConfigBuilder { @@ -73,6 +93,12 @@ impl ParserConfigBuilder { self } + /// Specifies the mode for processing the image after its extracted + pub fn image_handling_mode(mut self, value: ImageHandlingMode) -> Self { + self.image_handling_mode = Some(value); + self + } + /// Builds the final [`ParserConfig`] instance, applying default values for any fields that were not set. 
pub fn build(self) -> ParserConfig { @@ -80,6 +106,7 @@ impl ParserConfigBuilder { extract_images: self.extract_images.unwrap_or(true), compress_images: self.compress_images.unwrap_or(true), quality: self.image_quality.unwrap_or(80), + image_handling_mode: self.image_handling_mode.unwrap_or(ImageHandlingMode::InMarkdown), } } } \ No newline at end of file From c1f87164f33b8afd10ae7859a0d63b369b26e3bb Mon Sep 17 00:00:00 2001 From: nilsk Date: Mon, 16 Jun 2025 14:37:55 +0200 Subject: [PATCH 2/3] applied unrelated `clippy` refactoring suggestions --- src/slide.rs | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/src/slide.rs b/src/slide.rs index 81bbff2..f34e3b8 100644 --- a/src/slide.rs +++ b/src/slide.rs @@ -119,10 +119,10 @@ impl Slide { counters.resize(level + 1, 0); } - if level > previous_level { - counters[level] = 0; - } else if level < previous_level { - counters.truncate(level + 1); + match level.cmp(&previous_level) { + std::cmp::Ordering::Greater => counters[level] = 0, + std::cmp::Ordering::Less => counters.truncate(level + 1), + std::cmp::Ordering::Equal => {} } counters[level] += 1; @@ -296,7 +296,7 @@ mod tests { let raw_image = load_image_data("example-image.jpg"); - if let Some(compression_result) = slide.compress_image(&*raw_image) { + if let Some(compression_result) = slide.compress_image(&raw_image) { assert!(compression_result.len() < raw_image.len()); } else { panic!("Compression failed"); @@ -308,7 +308,7 @@ mod tests { let slide = mock_slide(); let raw_image = load_image_data("example-image.jpg"); - if let Some(compression_result) = slide.compress_image(&*raw_image) { + if let Some(compression_result) = slide.compress_image(&raw_image) { let result = image::load_from_memory(&compression_result); assert!(result.is_ok()); } else { From 8145e6288b409b347a49a62c98079d26f9246a8e Mon Sep 17 00:00:00 2001 From: nilsk Date: Mon, 16 Jun 2025 16:11:45 +0200 Subject: [PATCH 3/3] included logic to handle images manually --- 
CHANGELOG.md | 16 +++++ README.md | 13 ++-- examples/basic_usage.rs | 5 +- examples/image_extraction.rs | 11 ++-- examples/manual_image_extraction.rs | 95 +++++++++++++++++++++++++++++ src/lib.rs | 2 +- src/parser_config.rs | 26 ++++---- src/slide.rs | 51 +++++++++++++++- src/types.rs | 2 +- 9 files changed, 189 insertions(+), 32 deletions(-) create mode 100644 examples/manual_image_extraction.rs diff --git a/CHANGELOG.md b/CHANGELOG.md index bacc1cc..90a2dfe 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -5,6 +5,22 @@ All notable changes to this project will be documented in this file. The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.1.0/), and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html). +## [0.3.0] - _unreleased_ + +### Added + +- Reworked the extraction of images by adding `ImageHandlingMode` to the `ParserConfig`. With this, users can decide to manually extract images and handle the logic [(#19)](https://github.com/nilskruthoff/pptx-parser/issues/19) +- New [example](https://github.com/nilskruthoff/pptx-parser/tree/master/examples) `manual_image_extraction.rs` to show how to handle images manually +- `ManualImage` struct to encapsulate data and meta data of images + +### Removed + +- `image_extraction` from [examples](https://github.com/nilskruthoff/pptx-parser/tree/master/examples) directory (replaced by `manual_image_extraction.rs`) + +### Changed + +--- + ## [0.2.0] - 2025-06-15 ### Added diff --git a/README.md b/README.md index b916ffc..8804b3f 100644 --- a/README.md +++ b/README.md @@ -64,18 +64,17 @@ fn main() -> Result<(), Box> { | Parameter | Type | Default | Description | |--------------------------|-----------------------|---------------|-----------------------------------------------------------------------------------------------------------| -| `extract_images` | `bool` | `true` | Whether images are extracted from slides or not | -| `compress_images` | `bool` | `true` | Whether 
images are compressed before encoding or not | +| `extract_images` | `bool` | `true` | Whether images are extracted from slides or not. If false, images cannot be extracted manually either. | +| `compress_images` | `bool` | `true` | Whether images are compressed before encoding or not. Affects manually extracted images too. | | `image_quality` | `u8` | `80` | Defines the image compression quality `(0-100)`. Higher values mean better quality but larger file sizes. | | `image_handling_mode` | `ImageHandlingMode` | `InMarkdown` | Determines how images are handled during content export |
#### Member of `ImageHandlingMode` -| Member | Description | -|-----------------------|-------------------------------------------------------------------------------------------------------| -| `InMarkdown` | Images are embedded directly in the Markdown output using standard syntax as `base64` data (`![]()`) | -| `ManuallyMarkdown` | Image handling is delegated to the user, requiring manual copying or referencing (as `base64`) | -| `ManuallyRaw` | Image handling is delegated to the user, requiring manual copying or referencing (as raw `binary`) | +| Member | Description | +|-----------------|-------------------------------------------------------------------------------------------------------| +| `InMarkdown` | Images are embedded directly in the Markdown output using standard syntax as `base64` data (`![]()`) | +| `Manually` | Image handling is delegated to the user, requiring manual copying or referencing (as `base64`) | --- ## 🏗 Project Structure diff --git a/examples/basic_usage.rs b/examples/basic_usage.rs index 88e134c..903c4ee 100644 --- a/examples/basic_usage.rs +++ b/examples/basic_usage.rs @@ -4,7 +4,7 @@ //! //! 
Run with: cargo run --example basic_usage -use pptx_to_md::{PptxContainer, Result, ParserConfig}; +use pptx_to_md::{PptxContainer, Result, ParserConfig, ImageHandlingMode}; use std::env; use std::fs::File; use std::io::Write; @@ -25,6 +25,9 @@ fn main() -> Result<()> { // Use the config builder to build your config let config = ParserConfig::builder() .extract_images(true) + .compress_images(true) + .quality(75) + .image_handling_mode(ImageHandlingMode::InMarkdown) .build(); // Open the PPTX file diff --git a/examples/image_extraction.rs b/examples/image_extraction.rs index 3319f7c..17a9c2d 100644 --- a/examples/image_extraction.rs +++ b/examples/image_extraction.rs @@ -66,13 +66,10 @@ fn main() -> Result<()> { ext ); - match image_data { - Some(image_data) => { - fs::write(&output_path, image_data)?; - println!("Saved image to {}", output_path); - image_count += 1; - }, - None => {} + if let Some(image_data) = image_data { + fs::write(&output_path, image_data)?; + println!("Saved image to {}", output_path); + image_count += 1; } } } diff --git a/examples/manual_image_extraction.rs b/examples/manual_image_extraction.rs new file mode 100644 index 0000000..1e8e4b7 --- /dev/null +++ b/examples/manual_image_extraction.rs @@ -0,0 +1,95 @@ +//! Basic usage example for the pptx-to-md crate +//! +//! This example demonstrates how to open a PPTX file and convert all slides to Markdown. +//! +//! 
Run with: cargo run --example manual_image_extraction + +use pptx_to_md::{PptxContainer, Result, ParserConfig, ImageHandlingMode}; +use std::{env, fs}; +use std::fs::File; +use std::io::Write; +use std::path::Path; +use base64::Engine; +use base64::engine::general_purpose; + +fn main() -> Result<()> { + // Get the PPTX file path from command line arguments + let args: Vec = env::args().collect(); + let pptx_path = if args.len() > 1 { + &args[1] + } else { + eprintln!("Usage: cargo run --example manual_image_extraction "); + return Ok(()); + }; + + println!("Processing PPTX file: {}", pptx_path); + + // Use the config builder to build your config + let config = ParserConfig::builder() + .extract_images(true) + .compress_images(true) + .quality(75) + .image_handling_mode(ImageHandlingMode::Manually) + .build(); + + // Open the PPTX file + let mut container = PptxContainer::open(Path::new(pptx_path), config)?; + + // Parse all slides + let slides = container.parse_all()?; + + println!("Found {} slides", slides.len()); + + // create a new Markdown file + let mut md_file = File::create("output.md")?; + + // Create output directory + let output_dir = "extracted_images"; + fs::create_dir_all(output_dir)?; + + // Process slides one by one using the iterator + let mut image_count = 1; + + // Convert each slide to Markdown and save + for slide in slides { + if let Some(md_content) = slide.convert_to_md() { + writeln!(md_file, "{}", md_content).expect("Couldn't write to file"); + } + + // Manually load the base64 encoded image strings from the slide + if let Some(images) = slide.load_images_manually() { + for image in images { + + // Decode the base64 strings back to raw image data + let image_data = general_purpose::STANDARD.decode(image.base64_content.clone()).unwrap(); + + // Extract image extension if the image is not compressed, otherwise its always `.jpg` + let ext = slide.config.compress_images + .then(|| "jpg".to_string()) + .unwrap_or_else(|| 
slide.get_image_extension(&image.img_ref.target.clone())); + + // Construct a unique file name + let file_name = format!("slide{}_image{}_{}", slide.slide_number, image_count, &image.img_ref.id); + + // Save the image + let output_path = format!( + "{}/{}.{}", + output_dir, + &file_name, + ext + ); + fs::write(&output_path, image_data)?; + println!("Saved image to {}", output_path); + + // Write the image data into the Markdown file + writeln!(md_file, "![{}](data:image/{};base64,{})", file_name, ext, image.base64_content).expect("Couldn't write to file"); + + image_count += 1; + } + } + } + + println!("All slides converted successfully!"); + + Ok(()) +} \ No newline at end of file diff --git a/src/lib.rs b/src/lib.rs index 80293e4..a17a9a2 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -7,7 +7,7 @@ pub mod parse_rels; mod parser_config; pub use container::PptxContainer; -pub use parser_config::ParserConfig; +pub use parser_config::{ParserConfig, ImageHandlingMode}; pub use slide::Slide; pub use types::*; diff --git a/src/parser_config.rs b/src/parser_config.rs index 7398af9..d4ad65d 100644 --- a/src/parser_config.rs +++ b/src/parser_config.rs @@ -2,16 +2,14 @@ /// /// # Members /// -/// | Member | Description | -/// |-----------------------|-------------------------------------------------------------------------------------------------------| -/// | `InMarkdown` | Images are embedded directly in the Markdown output using standard syntax as `base64` data (`![]()`) | -/// | `ManuallyMarkdown` | Image handling is delegated to the user, requiring manual copying or referencing (as `base64`) | -/// | `ManuallyRaw` | Image handling is delegated to the user, requiring manual copying or referencing (as raw `binary`) | -#[derive(Debug, Clone)] +/// | Member | Description | +/// |-----------------------|-----------------------------------------------------------------------------------------------------------------------| +/// | `InMarkdown` | Images are embedded directly in 
the Markdown output using standard syntax as `base64` data (`![]()`) | +/// | `Manually` | Image handling is delegated to the user, requiring manual copying or referencing (as `base64` encoded string) | +#[derive(Debug, Clone, PartialEq, Eq)] pub enum ImageHandlingMode { InMarkdown, - ManuallyMarkdown, - ManuallyRaw, + Manually } /// Configuration options for the PPTX parser. @@ -21,12 +19,12 @@ pub enum ImageHandlingMode { /// /// # Configuration Options /// -/// | Parameter | Type | Default | Description | -/// |---------------------------|-----------------------|---------------|-------------------------------------------------------------------------------------------| -/// | `extract_images` | `bool` | `true` | Whether images are extracted from slides or not | -/// | `compress_images` | `bool` | `true` | Whether images are compressed before encoding or not | -/// | `image_quality` | `u8` | `80` | Compression level (0-100);
higher values retain more detail but increase file size | -/// | `image_handling_mode` | `ImageHandlingMode` | `InMarkdown` | Determines how images are handled during content export. | +/// | Parameter | Type | Default | Description | +/// |---------------------------|-----------------------|---------------|-----------------------------------------------------------------------------------------------------------| +/// | `extract_images` | `bool` | `true` | Whether images are extracted from slides or not. If false, images cannot be extracted manually either. | +/// | `compress_images` | `bool` | `true` | Whether images are compressed before encoding or not. Affects manually extracted images too. | +/// | `image_quality` | `u8` | `80` | Compression level (0-100);
higher values retain more detail but increase file size | +/// | `image_handling_mode` | `ImageHandlingMode` | `InMarkdown` | Determines how images are handled during content export. | /// /// # Example /// diff --git a/src/slide.rs b/src/slide.rs index f34e3b8..85384da 100644 --- a/src/slide.rs +++ b/src/slide.rs @@ -4,6 +4,23 @@ use std::collections::HashMap; use std::io::Cursor; use std::path::Path; use image::ImageOutputFormat; +use crate::parser_config::ImageHandlingMode; + +/// Encapsulates images for manual extraction of images from slides +#[derive(Debug)] +pub struct ManualImage { + pub base64_content: String, + pub img_ref: ImageReference, +} + +impl ManualImage { + pub fn new(base64_content: String, img_ref: ImageReference) -> ManualImage { + Self { + base64_content, + img_ref, + } + } +} /// Represents a single slide extracted from a PowerPoint (pptx) file. /// @@ -91,10 +108,12 @@ impl Slide { slide_txt.push('\n'); }, SlideElement::Image(image_ref) => { + if self.config.image_handling_mode != ImageHandlingMode::InMarkdown { slide_txt.push('\n'); continue; } + if let Some(image_data) = self.image_data.get(&image_ref.id) { let image_data = self.config.compress_images .then(|| self.compress_image(image_data)) - .unwrap_or(Option::from(image_data.clone())); + .unwrap_or_else(|| Option::from(image_data.clone())); let base64_string = general_purpose::STANDARD.encode(image_data?); let image_name = &image_ref.target.split('/').last()?; @@ -226,6 +245,36 @@ impl Slide { None } } + + pub fn load_images_manually(&self) -> Option> { + let mut images: Vec = Vec::new(); + + let image_refs: Vec<&ImageReference> = self.elements + .iter() + .filter_map(|element| match element { + SlideElement::Image(ref img) => Some(img), + _ => None, + }) + .collect(); + + for image_ref in image_refs { + if let Some(image_data) = self.image_data.get(&image_ref.id) { + let image_data = self.config.compress_images + .then( | | self.compress_image(image_data)) + .unwrap_or_else(|| 
Option::from(image_data.clone())); + + let base64_str = general_purpose::STANDARD.encode(image_data?); + + let image = ManualImage::new( + base64_str, + image_ref.clone(), + ); + images.push(image); + } + } + + Some(images) + } } #[cfg(test)] diff --git a/src/types.rs b/src/types.rs index 12bf583..cc90445 100644 --- a/src/types.rs +++ b/src/types.rs @@ -17,7 +17,7 @@ pub enum SlideElement { Unknown, } -#[derive(Debug)] +#[derive(Debug, Clone)] pub struct ImageReference { pub id: String, pub target: String,