diff --git a/CHANGELOG.md b/CHANGELOG.md index 90a2dfe..e5c2884 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -12,6 +12,7 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0 - Reworked the extraction of images by adding `ImageHandlingMode` to the `ParserConfig`. With this, users can decide to manually extract images and handle the logic [(#19)](https://github.com/nilskruthoff/pptx-parser/issues/19) - New [example](https://github.com/nilskruthoff/pptx-parser/tree/master/examples) `manual_image_extraction.rs` to show how to handle images manually - `ManualImage` struct to encapsulate data and meta data of images +- `ImageHandlingMode::Save` to save images in a given output path and adding context to the Markdown file [(#20)](https://github.com/nilskruthoff/pptx-parser/issues/20) ### Removed @@ -19,6 +20,8 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0 ### Changed +- Updated [README.md](https://github.com/nilskruthoff/pptx-parser/blob/master/README.md) to document new `ParserConfig` parameters + --- ## [0.2.0] - 2025-06-15 @@ -39,4 +42,4 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0 ### Changed -- [README.md](https://github.com/nilskruthoff/pptx-parser/blob/master/README.md) updated to show the latest working examples and features \ No newline at end of file +- Updated [README.md](https://github.com/nilskruthoff/pptx-parser/blob/master/README.md) to show the latest working examples and features \ No newline at end of file diff --git a/Cargo.toml b/Cargo.toml index 6cfc83b..80d0b42 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -18,8 +18,12 @@ name = "memory_efficient_streaming" path = "examples/memory_efficient_streaming.rs" [[example]] -name = "image_extraction" -path = "examples/image_extraction.rs" +name = "manual_image_extraction" +path = "examples/manual_image_extraction.rs" + +[[example]] +name = "save_images" +path = "examples/save_images.rs" [[example]] 
name = "slide_elements" diff --git a/README.md b/README.md index 8804b3f..9c6644e 100644 --- a/README.md +++ b/README.md @@ -62,19 +62,23 @@ fn main() -> Result<(), Box> { ## Config Parameters -| Parameter | Type | Default | Description | -|--------------------------|-----------------------|---------------|-----------------------------------------------------------------------------------------------------------| -| `extract_images` | `bool` | `true` | Whether images are extracted from slides or not. If false, images can not be extracted manually either. | -| `compress_images` | `bool` | `true` | Whether images are compressed before encoding or not. Effects manually extracted images too. | -| `image_quality` | `u8` | `80` | Defines the image compression quality `(0-100)`. Higher values mean better quality but larger file sizes. | -| `image_handling_mode` | `ImageHandlingMode` | `InMarkdown` | Determines how images are handled during content export | +| Parameter | Type | Default | Description | +|------------------------|-----------------------|---------------|-----------------------------------------------------------------------------------------------------------| +| `extract_images` | `bool` | `true` | Whether images are extracted from slides or not. If false, images can not be extracted manually either. | +| `compress_images` | `bool` | `true` | Whether images are compressed before encoding or not. Affects manually extracted images too. | +| `image_quality` | `u8` | `80` | Defines the image compression quality `(0-100)`. Higher values mean better quality but larger file sizes. | +| `image_handling_mode` | `ImageHandlingMode` | `InMarkdown` | Determines how images are handled during content export | +| `image_output_path` | `Option<PathBuf>` | `None` | Output directory path for `ImageHandlingMode::Save` (mandatory for saving mode) | +
#### Member of `ImageHandlingMode` -| Member | Description | -|-----------------|-------------------------------------------------------------------------------------------------------| -| `InMarkdown` | Images are embedded directly in the Markdown output using standard syntax as `base64` data (`![]()`) | -| `Manually` | Image handling is delegated to the user, requiring manual copying or referencing (as `base64`) | +| Member | Description | +|---------------|-------------------------------------------------------------------------------------------------------| +| `InMarkdown` | Images are embedded directly in the Markdown output using standard syntax as `base64` data (`![]()`) | +| `Manually` | Image handling is delegated to the user, requiring manual copying or referencing (as `base64`) | +| `Save` | Images will be saved in a provided output directory and integrated using standard syntax (`![]()`) | + --- ## 🏗 Project Structure @@ -87,8 +91,10 @@ pptx-to-md/ ├── LICENSE-APACHE ├── examples/ # Simple examples to present the usage of this crate │ ├── basic_usage.rs -│ ├── image_extractions.rs +│ ├── manual_image_extraction.rs │ ├── memory_efficient_streaming.rs +│ ├── performance_tests.rs +│ ├── save_images.rs │ └── slide_elements.rs ├── src/ │ ├── lib.rs # Public API @@ -111,7 +117,7 @@ Include the following line in your Cargo.toml dependencies section: ```toml [dependencies] -pptx-to-md = "0.1.2" # replace with the current version +pptx-to-md = "0.3.0" # replace with the current version ``` --- @@ -122,5 +128,4 @@ and [Apache 2.0-Licence](https://github.com/nilskruthoff/pptx-parser/blob/master Feel free to contribute or suggest improvements! ---- - +--- \ No newline at end of file diff --git a/examples/image_extraction.rs b/examples/image_extraction.rs deleted file mode 100644 index 17a9c2d..0000000 --- a/examples/image_extraction.rs +++ /dev/null @@ -1,87 +0,0 @@ -//! Image extraction example for the pptx-to-md crate -//! -//! 
This example demonstrates how to extract images from a PPTX file. -//! -//! Run with: cargo run --example image_extraction - -use std::env; -use std::path::Path; -use std::fs; -use pptx_to_md::{PptxContainer, SlideElement, Result, ParserConfig}; - -fn main() -> Result<()> { - // Get the PPTX file path from command line arguments - let args: Vec = env::args().collect(); - let pptx_path = if args.len() > 1 { - &args[1] - } else { - eprintln!("Usage: cargo run --example image_extraction "); - return Ok(()); - }; - - println!("Extracting images from PPTX file: {}", pptx_path); - - // Use the config builder to build your config - let config = ParserConfig::builder() - .extract_images(true) - .compress_images(true) - .quality(75) - .build(); - - // Open the PPTX file with the streaming API - let mut streamer = PptxContainer::open(Path::new(pptx_path), config)?; - - // Create output directory - let output_dir = "extracted_images"; - fs::create_dir_all(output_dir)?; - - // Process slides one by one using the iterator - let mut image_count = 0; - - for slide_result in streamer.iter_slides() { - match slide_result { - Ok(slide) => { - // Find image elements in the slide - for (element_idx, element) in slide.elements.iter().enumerate() { - if let SlideElement::Image(img_ref) = element { - // Get image data from the slide's image_data HashMap - if let Some(image_data) = slide.image_data.get(&img_ref.id) { - // Image data will be compressed if the config is true, otherwise its unchanged - let image_data = slide.config.compress_images - .then(|| slide.compress_image(image_data)) - .unwrap_or(Option::from(image_data.clone())); - - // Extract image extension if the image is not compressed, otherwise its always `.jpg` - let ext = slide.config.compress_images - .then(|| "jpg".to_string()) - .unwrap_or_else(|| slide.get_image_extension(&img_ref.target.clone())); - - // Save the image - let output_path = format!( - "{}/slide{}_image{}_{}.{}", - output_dir, - slide.slide_number, - 
element_idx, - &img_ref.id, - ext - ); - - if let Some(image_data) = image_data { - fs::write(&output_path, image_data)?; - println!("Saved image to {}", output_path); - image_count += 1; - } - } - } - } - }, - Err(e) => { - eprintln!("Error processing slide: {:?}", e); - } - } - } - - println!("Extracted {} images successfully!", image_count); - - Ok(()) -} \ No newline at end of file diff --git a/examples/save_images.rs b/examples/save_images.rs new file mode 100644 index 0000000..30e389c --- /dev/null +++ b/examples/save_images.rs @@ -0,0 +1,55 @@ +//! Image saving example for the pptx-to-md crate +//! +//! This example demonstrates how to convert all slides of a PPTX file to Markdown while saving the extracted images to an output directory. +//! +//! Run with: cargo run --example save_images + +use pptx_to_md::{ImageHandlingMode, ParserConfig, PptxContainer, Result}; +use std::fs::File; +use std::io::Write; +use std::path::{Path, PathBuf}; +use std::env; + +fn main() -> Result<()> { + // Get the PPTX file path from command line arguments and provide the mandatory output path + let args: Vec = env::args().collect(); + let pptx_path = if args.len() > 1 { + &args[1] + } else { + eprintln!("Usage: cargo run --example save_images "); + return Ok(()); + }; + + println!("Processing PPTX file: {}", pptx_path); + + // Use the config builder to build your config + let config = ParserConfig::builder() + .extract_images(true) + .compress_images(true) + .quality(75) + .image_handling_mode(ImageHandlingMode::Save) + .image_output_path(PathBuf::from("C:/Users/nilsk/Downloads/extracted_images")) + .build(); + + // Open the PPTX file + let mut container = PptxContainer::open(Path::new(pptx_path), config)?; + + // Parse all slides + let slides = container.parse_all()?; + + println!("Found {} slides", slides.len()); + + // create a new Markdown file + let mut md_file = File::create("output.md")?; + + // Convert each slide to Markdown and save the images automatically + for slide in slides { + if let Some(md_content) = 
slide.convert_to_md() { + writeln!(md_file, "{}", md_content).expect("Couldn't write to file"); + } + } + + println!("All slides converted successfully!"); + + Ok(()) +} \ No newline at end of file diff --git a/src/parser_config.rs b/src/parser_config.rs index d4ad65d..903bdf9 100644 --- a/src/parser_config.rs +++ b/src/parser_config.rs @@ -1,4 +1,6 @@ -/// Determines how images are handled during content export. +use std::path::PathBuf; + +/// Determines how images are handled during content export. /// /// # Members /// @@ -6,10 +8,12 @@ /// |-----------------------|-----------------------------------------------------------------------------------------------------------------------| /// | `InMarkdown` | Images are embedded directly in the Markdown output using standard syntax as `base64` data (`![]()`) | /// | `Manually` | Image handling is delegated to the user, requiring manual copying or referencing (as `base64` encoded string) | +/// | `Save` | Images will be saved in a provided output directory and integrated using standard syntax (`![]()`) | #[derive(Debug, Clone, PartialEq, Eq)] pub enum ImageHandlingMode { InMarkdown, - Manually + Manually, + Save, } /// Configuration options for the PPTX parser. @@ -18,21 +22,27 @@ pub enum ImageHandlingMode { /// This allows you to customize only the desired fields while falling back to sensible defaults for the rest. /// /// # Configuration Options -/// +/// /// | Parameter | Type | Default | Description | /// |---------------------------|-----------------------|---------------|-----------------------------------------------------------------------------------------------------------| -/// | `extract_images` | `bool` | `true` | Whether images are extracted from slides or not. If false, images can not be extracted manually either. | -/// | `compress_images` | `bool` | `true` | Whether images are compressed before encoding or not. Effects manually extracted images too. 
| +/// | `extract_images` | `bool` | `true` | Whether images are extracted from slides or not. If false, images can not be extracted manually either | +/// | `compress_images` | `bool` | `true` | Whether images are compressed before encoding or not. Affects manually extracted images too | /// | `image_quality` | `u8` | `80` | Compression level (0-100);
higher values retain more detail but increase file size | -/// | `image_handling_mode` | `ImageHandlingMode` | `InMarkdown` | Determines how images are handled during content export. | +/// | `image_handling_mode` | `ImageHandlingMode` | `InMarkdown` | Determines how images are handled during content export | +/// | `image_output_path` | `Option` | `None` | Output directory path for `ImageHandlingMode::Save` (mandatory for the saving mode) | /// /// # Example /// /// ``` -/// use pptx_to_md::ParserConfig; +/// use std::path::PathBuf; +/// use pptx_to_md::{ImageHandlingMode, ParserConfig}; /// /// let config = ParserConfig::builder() /// .extract_images(true) +/// .compress_images(true) +/// .quality(75) +/// .image_handling_mode(ImageHandlingMode::Save) +/// .image_output_path(PathBuf::from("/path/to/output/dir/")) /// .build(); /// ``` #[derive(Debug, Clone)] @@ -41,15 +51,17 @@ pub struct ParserConfig { pub compress_images: bool, pub quality: u8, pub image_handling_mode: ImageHandlingMode, + pub image_output_path: Option, } impl Default for ParserConfig { fn default() -> Self { - Self { + Self { extract_images: true, compress_images: true, quality: 80, image_handling_mode: ImageHandlingMode::InMarkdown, + image_output_path: None, } } } @@ -69,6 +81,7 @@ pub struct ParserConfigBuilder { compress_images: Option, image_quality: Option, image_handling_mode: Option, + image_output_path: Option, } impl ParserConfigBuilder { @@ -77,26 +90,34 @@ impl ParserConfigBuilder { self.extract_images = Some(value); self } - + /// Sets weather images should be compressed before encoded to base64 or not pub fn compress_images(mut self, value: bool) -> Self { self.compress_images = Some(value); self } - + /// Specifies the desired image quality where `100` is the original quality and `50` means half the quality /// The lower the quality, the smaller the file size of the output image will be pub fn quality(mut self, value: u8) -> Self { self.image_quality = Some(value); self } - + 
/// Specifies the mode for processing the image after its extracted pub fn image_handling_mode(mut self, value: ImageHandlingMode) -> Self { self.image_handling_mode = Some(value); self } - + + /// Specifies the output directory for the [`ImageHandlingMode::Save`] + pub fn image_output_path

(mut self, path: P) -> Self + where + P: Into, + { + self.image_output_path = Some(path.into()); + self + } /// Builds the final [`ParserConfig`] instance, applying default values for any fields that were not set. pub fn build(self) -> ParserConfig { @@ -105,6 +126,7 @@ impl ParserConfigBuilder { compress_images: self.compress_images.unwrap_or(true), quality: self.image_quality.unwrap_or(80), image_handling_mode: self.image_handling_mode.unwrap_or(ImageHandlingMode::InMarkdown), + image_output_path: self.image_output_path, } } } \ No newline at end of file diff --git a/src/slide.rs b/src/slide.rs index 85384da..fc2fb34 100644 --- a/src/slide.rs +++ b/src/slide.rs @@ -1,10 +1,11 @@ -use crate::{ImageReference, ParserConfig, SlideElement}; +use crate::parser_config::ImageHandlingMode; +use crate::{ImageReference, ParserConfig, SlideElement}; use base64::{engine::general_purpose, Engine as _}; +use image::ImageOutputFormat; use std::collections::HashMap; +use std::fs; use std::io::Cursor; -use std::path::Path; -use image::ImageOutputFormat; -use crate::parser_config::ImageHandlingMode; +use std::path::{Path, PathBuf}; /// Encapsulates images for manual extraction of images from slides #[derive(Debug)] @@ -73,7 +74,8 @@ impl Slide { pub fn convert_to_md(&self) -> Option { let mut slide_txt = String::new(); slide_txt.push_str(format!("\n\n", self.slide_number).as_str()); - + let mut image_count = 0; + for element in &self.elements { match element { SlideElement::Text(text) => { @@ -108,18 +110,51 @@ impl Slide { slide_txt.push('\n'); }, SlideElement::Image(image_ref) => { - if self.config.image_handling_mode != ImageHandlingMode::InMarkdown { slide_txt.push('\n'); continue; } - - if let Some(image_data) = self.image_data.get(&image_ref.id) { - let image_data = self.config.compress_images - .then(|| self.compress_image(image_data)) - .unwrap_or_else(|| Option::from(image_data.clone())); - - let base64_string = general_purpose::STANDARD.encode(image_data?); - let 
image_name = &image_ref.target.split('/').last()?; - let file_ext = &image_name.split('.').last()?; - - slide_txt.push_str(format!("![{}](data:image/{};base64,{})", image_name, file_ext, base64_string).as_str()); + match self.config.image_handling_mode { + ImageHandlingMode::InMarkdown => { + if let Some(image_data) = self.image_data.get(&image_ref.id) { + let image_data = self.config.compress_images + .then(|| self.compress_image(image_data)) + .unwrap_or_else(|| Option::from(image_data.clone())); + + let base64_string = general_purpose::STANDARD.encode(image_data?); + let image_name = &image_ref.target.split('/').last()?; + let file_ext = &image_name.split('.').last()?; + + slide_txt.push_str(format!("![{}](data:image/{};base64,{})", image_name, file_ext, base64_string).as_str()); + } + } + ImageHandlingMode::Save => { + if let Some(image_data) = self.image_data.get(&image_ref.id) { + let image_data = self.config.compress_images + .then(|| self.compress_image(image_data)) + .unwrap_or_else(|| Option::from(image_data.clone())); + + let ext = self.config.compress_images + .then(|| "jpg".to_string()) + .unwrap_or_else(|| self.get_image_extension(&image_ref.target.clone())); + + let output_dir = self.config + .image_output_path + .clone() + .unwrap_or_else(|| PathBuf::from(".")); + + let _ = fs::create_dir_all(&output_dir); + + let mut image_path = output_dir.clone(); + let file_name = format!("slide{}_image{}_{}.{}", self.slide_number, image_count + 1, &image_ref.id, ext); + image_path.push(&file_name); + + let _ = fs::write(&image_path, image_data?); + + let abs_file_url = self.path_to_file_url(&image_path); + let html_link = format!(r#"{file_name}"#, abs_file_url?); + image_count += 1; + slide_txt.push_str(&html_link); + slide_txt.push('\n'); + } + } + ImageHandlingMode::Manually => { slide_txt.push('\n'); continue; } } slide_txt.push('\n'); } @@ -275,15 +310,30 @@ impl Slide { Some(images) } + + fn path_to_file_url(&self, path: &Path) -> Option { + let abs_path = 
path.canonicalize().ok()?; + let mut path_str = abs_path.to_string_lossy().replace('\\', "/"); + + // remove windows unc prefix + if cfg!(windows) { + if let Some(stripped) = path_str.strip_prefix("//?/") { + path_str = stripped.to_string(); + } + Some(format!("file:///{}", path_str)) + } else { + Some(format!("file://{}", path_str)) + } + } } #[cfg(test)] mod tests { use std::fs; use std::path::PathBuf; - + use super::*; - + fn mock_slide() -> Slide { Slide { rel_path: "ppt/slides/slide1.xml".to_string(),