Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
16 changes: 16 additions & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,22 @@ All notable changes to this project will be documented in this file.
The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.1.0/),
and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html).

## [0.3.0] - _unreleased_

### Added

- Reworked the extraction of images by adding `ImageHandlingMode` to the `ParserConfig`. With this, users can decide to manually extract images and handle the logic [(#19)](https://github.com/nilskruthoff/pptx-parser/issues/19)
- New [example](https://github.com/nilskruthoff/pptx-parser/tree/master/examples) `manual_image_extraction.rs` to show how to handle images manually
- `ManualImage` struct to encapsulate the data and metadata of images

### Removed

- `image_extraction` from [examples](https://github.com/nilskruthoff/pptx-parser/tree/master/examples) directory (replaced by `manual_image_extraction.rs`)

### Changed

---

## [0.2.0] - 2025-06-15

### Added
Expand Down
19 changes: 13 additions & 6 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -62,12 +62,19 @@ fn main() -> Result<(), Box<dyn std::error::Error>> {

## Config Parameters

| Parameter | Type | Default | Description |
|-----------|--------|-------|------------------------------------------------------------------------------------------------------------------------------------------------------------|
| `extract_images` | `bool` | `true` | Whether images are extracted from slides or not |
| `compress_images` | `bool` | `true` | Whether images are compressed before encoding or not |
| `image_quality` | `u8` | `80` | Defines the image compression quality `(0-100)`. Higher values mean better quality but larger file sizes. |

| Parameter | Type | Default | Description |
|--------------------------|-----------------------|---------------|-----------------------------------------------------------------------------------------------------------|
| `extract_images` | `bool` | `true` | Whether images are extracted from slides or not. If `false`, images cannot be extracted manually either. |
| `compress_images` | `bool` | `true` | Whether images are compressed before encoding or not. Also affects manually extracted images. |
| `image_quality` | `u8` | `80` | Defines the image compression quality `(0-100)`. Higher values mean better quality but larger file sizes. |
| `image_handling_mode` | `ImageHandlingMode` | `InMarkdown` | Determines how images are handled during content export |
<br/>

#### Members of `ImageHandlingMode`
| Member | Description |
|-----------------|-------------------------------------------------------------------------------------------------------|
| `InMarkdown` | Images are embedded directly in the Markdown output using standard syntax as `base64` data (`![]()`) |
| `Manually` | Image handling is delegated to the user, requiring manual copying or referencing (as `base64`) |
---

## 🏗 Project Structure
Expand Down
5 changes: 4 additions & 1 deletion examples/basic_usage.rs
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,7 @@
//!
//! Run with: cargo run --example basic_usage <path/to/your/presentation.pptx>

use pptx_to_md::{PptxContainer, Result, ParserConfig};
use pptx_to_md::{PptxContainer, Result, ParserConfig, ImageHandlingMode};
use std::env;
use std::fs::File;
use std::io::Write;
Expand All @@ -25,6 +25,9 @@ fn main() -> Result<()> {
// Use the config builder to build your config
let config = ParserConfig::builder()
.extract_images(true)
.compress_images(true)
.quality(75)
.image_handling_mode(ImageHandlingMode::InMarkdown)
.build();

// Open the PPTX file
Expand Down
11 changes: 4 additions & 7 deletions examples/image_extraction.rs
Original file line number Diff line number Diff line change
Expand Up @@ -66,13 +66,10 @@ fn main() -> Result<()> {
ext
);

match image_data {
Some(image_data) => {
fs::write(&output_path, image_data)?;
println!("Saved image to {}", output_path);
image_count += 1;
},
None => {}
if let Some(image_data) = image_data {
fs::write(&output_path, image_data)?;
println!("Saved image to {}", output_path);
image_count += 1;
}
}
}
Expand Down
95 changes: 95 additions & 0 deletions examples/manual_image_extraction.rs
Original file line number Diff line number Diff line change
@@ -0,0 +1,95 @@
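//! Manual image extraction example for the pptx-to-md crate
//!
//! This example demonstrates how to open a PPTX file, convert all slides to Markdown
//! and handle the images manually: every image is saved to disk and embedded into the Markdown output.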
//!
//! Run with: cargo run --example manual_image_extraction <path/to/your/presentation.pptx>

use pptx_to_md::{PptxContainer, Result, ParserConfig, ImageHandlingMode};
use std::{env, fs};
use std::fs::File;
use std::io::Write;
use std::path::Path;
use base64::Engine;
use base64::engine::general_purpose;

/// Converts a PPTX file to Markdown while handling the images manually.
///
/// The slide text is written to `output.md`. Every image is decoded from the
/// base64 string returned by `load_images_manually`, saved into the
/// `extracted_images` directory and appended to the Markdown file as a data URI.
///
/// # Errors
///
/// Returns an error if the presentation cannot be opened or parsed, or if any
/// file-system operation (creating the output file/directory, writing) fails.
fn main() -> Result<()> {
    // Get the PPTX file path from command line arguments
    let pptx_path = match env::args().nth(1) {
        Some(path) => path,
        None => {
            eprintln!("Usage: cargo run --example manual_image_extraction <path/to/presentation.pptx>");
            return Ok(());
        }
    };

    println!("Processing PPTX file: {}", pptx_path);

    // Use the config builder to build your config
    let config = ParserConfig::builder()
        .extract_images(true)
        .compress_images(true)
        .quality(75)
        .image_handling_mode(ImageHandlingMode::Manually)
        .build();

    // Open the PPTX file
    let mut container = PptxContainer::open(Path::new(&pptx_path), config)?;

    // Parse all slides
    let slides = container.parse_all()?;

    println!("Found {} slides", slides.len());

    // Create a new Markdown file
    let mut md_file = File::create("output.md")?;

    // Create output directory
    let output_dir = "extracted_images";
    fs::create_dir_all(output_dir)?;

    // Running image number across all slides, used to build unique file names
    let mut image_count = 1;

    // Convert each slide to Markdown and save
    for slide in slides {
        if let Some(md_content) = slide.convert_to_md() {
            // Propagate I/O errors instead of panicking, like every other write in this example
            writeln!(md_file, "{}", md_content)?;
        }

        // Manually load the base64 encoded image strings from the slide
        if let Some(images) = slide.load_images_manually() {
            for image in images {
                // Decode the base64 string back to raw image data (borrowing, no copy of the string needed)
                let image_data = general_purpose::STANDARD
                    .decode(&image.base64_content)
                    .expect("base64 produced by the library must be decodable");

                // Extract the image extension if the image is not compressed, otherwise it's always `jpg`
                let ext = if slide.config.compress_images {
                    "jpg".to_string()
                } else {
                    slide.get_image_extension(&image.img_ref.target)
                };

                // Construct a unique file name
                let file_name = format!("slide{}_image{}_{}", slide.slide_number, image_count, &image.img_ref.id);

                // Save the image
                let output_path = format!("{}/{}.{}", output_dir, &file_name, ext);
                fs::write(&output_path, image_data)?;
                println!("Saved image to {}", output_path);

                // The registered media type for JPEG is `image/jpeg`; `image/jpg` is not a valid type
                let mime_subtype = if ext.eq_ignore_ascii_case("jpg") {
                    "jpeg"
                } else {
                    ext.as_str()
                };

                // Write the image data into the Markdown file
                writeln!(
                    md_file,
                    "![{}](data:image/{};base64,{})",
                    file_name, mime_subtype, image.base64_content
                )?;

                image_count += 1;
            }
        }
    }

    println!("All slides converted successfully!");

    Ok(())
}
2 changes: 1 addition & 1 deletion src/lib.rs
Original file line number Diff line number Diff line change
Expand Up @@ -7,7 +7,7 @@ pub mod parse_rels;
mod parser_config;

pub use container::PptxContainer;
pub use parser_config::ParserConfig;
pub use parser_config::{ParserConfig, ImageHandlingMode};
pub use slide::Slide;
pub use types::*;

Expand Down
37 changes: 31 additions & 6 deletions src/parser_config.rs
Original file line number Diff line number Diff line change
@@ -1,15 +1,30 @@
/// Configuration options for the PPTX parser.
/// Determines how images are handled during content export.
///
/// # Members
///
/// | Member                | Description                                                                                                           |
/// |-----------------------|-----------------------------------------------------------------------------------------------------------------------|
/// | `InMarkdown`          | Images are embedded directly in the Markdown output using standard syntax as `base64` data (`![]()`)                  |
/// | `Manually`            | Image handling is delegated to the user, requiring manual copying or referencing (as `base64` encoded string)         |
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub enum ImageHandlingMode {
    /// Images are embedded directly in the Markdown output as `base64` data (`![]()`).
    InMarkdown,
    /// Image handling is delegated to the user; images are not written into the Markdown output.
    Manually,
}

/// Configuration options for the PPTX parser.
///
/// Use [`ParserConfig::builder()`] to create a configuration instance.
/// This allows you to customize only the desired fields while falling back to sensible defaults for the rest.
///
/// # Configuration Options
///
/// | Parameter | Type | Default | Description |
/// |-----------|------|---------|-------------|
/// | `extract_images` | `bool` | `true` | Whether images are extracted from slides or not |
/// | `compress_images` | `bool` | `true` | Whether images are compressed before encoding or not |
/// | `image_quality` | `u8` | `80` | Compression level (0-100);<br/> higher values retain more detail but increase file size |
/// | Parameter | Type | Default | Description |
/// |---------------------------|-----------------------|---------------|-----------------------------------------------------------------------------------------------------------|
/// | `extract_images` | `bool` | `true` | Whether images are extracted from slides or not. If `false`, images cannot be extracted manually either. |
/// | `compress_images` | `bool` | `true` | Whether images are compressed before encoding or not. Also affects manually extracted images. |
/// | `image_quality` | `u8` | `80` | Compression level (0-100);<br/> higher values retain more detail but increase file size |
/// | `image_handling_mode` | `ImageHandlingMode` | `InMarkdown` | Determines how images are handled during content export. |
///
/// # Example
///
Expand All @@ -25,6 +40,7 @@ pub struct ParserConfig {
pub extract_images: bool,
pub compress_images: bool,
pub quality: u8,
pub image_handling_mode: ImageHandlingMode,
}

impl Default for ParserConfig {
Expand All @@ -33,6 +49,7 @@ impl Default for ParserConfig {
extract_images: true,
compress_images: true,
quality: 80,
image_handling_mode: ImageHandlingMode::InMarkdown,
}
}
}
Expand All @@ -51,6 +68,7 @@ pub struct ParserConfigBuilder {
extract_images: Option<bool>,
compress_images: Option<bool>,
image_quality: Option<u8>,
image_handling_mode: Option<ImageHandlingMode>,
}

impl ParserConfigBuilder {
Expand All @@ -73,13 +91,20 @@ impl ParserConfigBuilder {
self
}

/// Specifies the mode for processing the image after its extracted
pub fn image_handling_mode(mut self, value: ImageHandlingMode) -> Self {
self.image_handling_mode = Some(value);
self
}


/// Builds the final [`ParserConfig`] instance, applying default values for any fields that were not set.
pub fn build(self) -> ParserConfig {
ParserConfig {
extract_images: self.extract_images.unwrap_or(true),
compress_images: self.compress_images.unwrap_or(true),
quality: self.image_quality.unwrap_or(80),
image_handling_mode: self.image_handling_mode.unwrap_or(ImageHandlingMode::InMarkdown),
}
}
}
63 changes: 56 additions & 7 deletions src/slide.rs
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,23 @@ use std::collections::HashMap;
use std::io::Cursor;
use std::path::Path;
use image::ImageOutputFormat;
use crate::parser_config::ImageHandlingMode;

/// Encapsulates an image of a slide for manual extraction.
///
/// Bundles the encoded image content with the reference it belongs to, so the
/// caller can decide how to store or link the image.
#[derive(Debug, Clone)]
pub struct ManualImage {
    /// The image content as a `base64` encoded string.
    pub base64_content: String,
    /// The reference of the image inside the slide (e.g. its `id` and `target`).
    pub img_ref: ImageReference,
}

impl ManualImage {
    /// Creates a new [`ManualImage`] from the encoded content and its image reference.
    pub fn new(base64_content: String, img_ref: ImageReference) -> Self {
        Self {
            base64_content,
            img_ref,
        }
    }
}

/// Represents a single slide extracted from a PowerPoint (pptx) file.
///
Expand Down Expand Up @@ -91,10 +108,12 @@ impl Slide {
slide_txt.push('\n');
},
SlideElement::Image(image_ref) => {
if self.config.image_handling_mode != ImageHandlingMode::InMarkdown { slide_txt.push('\n'); continue; }

if let Some(image_data) = self.image_data.get(&image_ref.id) {
let image_data = self.config.compress_images
.then(|| self.compress_image(image_data))
.unwrap_or(Option::from(image_data.clone()));
.unwrap_or_else(|| Option::from(image_data.clone()));

let base64_string = general_purpose::STANDARD.encode(image_data?);
let image_name = &image_ref.target.split('/').last()?;
Expand All @@ -119,10 +138,10 @@ impl Slide {
counters.resize(level + 1, 0);
}

if level > previous_level {
counters[level] = 0;
} else if level < previous_level {
counters.truncate(level + 1);
match level.cmp(&previous_level) {
std::cmp::Ordering::Greater => counters[level] = 0,
std::cmp::Ordering::Less => counters.truncate(level + 1),
std::cmp::Ordering::Equal => {}
}

counters[level] += 1;
Expand Down Expand Up @@ -226,6 +245,36 @@ impl Slide {
None
}
}

/// Loads all images of this slide as `base64` encoded strings for manual handling.
///
/// Every [`SlideElement::Image`] whose id has an entry in `image_data` is turned
/// into a [`ManualImage`]; image elements without data are skipped. If
/// `compress_images` is enabled in the config, the image is compressed before
/// it is encoded.
///
/// # Returns
///
/// `Some` with the images in element order (possibly empty), or `None` if
/// compressing one of the images fails.
pub fn load_images_manually(&self) -> Option<Vec<ManualImage>> {
    let mut images: Vec<ManualImage> = Vec::new();

    // Iterate lazily over the image elements, no intermediate collection needed
    let image_refs = self.elements.iter().filter_map(|element| match element {
        SlideElement::Image(img) => Some(img),
        _ => None,
    });

    for image_ref in image_refs {
        if let Some(image_data) = self.image_data.get(&image_ref.id) {
            let base64_str = if self.config.compress_images {
                // A failed compression aborts the whole extraction with `None`
                general_purpose::STANDARD.encode(self.compress_image(image_data)?)
            } else {
                // Encode the stored bytes directly instead of cloning them first
                general_purpose::STANDARD.encode(image_data)
            };

            images.push(ManualImage::new(base64_str, image_ref.clone()));
        }
    }

    Some(images)
}
}

#[cfg(test)]
Expand Down Expand Up @@ -296,7 +345,7 @@ mod tests {

let raw_image = load_image_data("example-image.jpg");

if let Some(compression_result) = slide.compress_image(&*raw_image) {
if let Some(compression_result) = slide.compress_image(&raw_image) {
assert!(compression_result.len() < raw_image.len());
} else {
panic!("Compression failed");
Expand All @@ -308,7 +357,7 @@ mod tests {
let slide = mock_slide();
let raw_image = load_image_data("example-image.jpg");

if let Some(compression_result) = slide.compress_image(&*raw_image) {
if let Some(compression_result) = slide.compress_image(&raw_image) {
let result = image::load_from_memory(&compression_result);
assert!(result.is_ok());
} else {
Expand Down
2 changes: 1 addition & 1 deletion src/types.rs
Original file line number Diff line number Diff line change
Expand Up @@ -17,7 +17,7 @@ pub enum SlideElement {
Unknown,
}

#[derive(Debug)]
#[derive(Debug, Clone)]
pub struct ImageReference {
pub id: String,
pub target: String,
Expand Down