From 7cd66a677ebfe24d9a2ed0598ab32004ac4f9f9d Mon Sep 17 00:00:00 2001 From: SF-Zhou Date: Fri, 31 Oct 2025 14:04:31 +0800 Subject: [PATCH] update documentation --- README.md | 10 ++++++++++ examples/massmap.rs | 2 +- src/builder.rs | 25 ++++++++++++++++--------- src/massmap.rs | 8 ++++++-- src/meta.rs | 3 ++- src/reader.rs | 4 +++- 6 files changed, 38 insertions(+), 14 deletions(-) diff --git a/README.md b/README.md index 17265d3..5fd331c 100644 --- a/README.md +++ b/README.md @@ -43,6 +43,7 @@ fn main() -> std::io::Result<()> { MassMapBuilder::default() .with_hash_seed(42) .with_bucket_count(1024) + .with_bucket_size_limit(16 << 10) .build(file, entries.iter())?; // Read-only lookup phase. @@ -132,6 +133,15 @@ hexdump -C examples/demo.massmap #> 00000290 ``` +## Configuration + +- `with_hash_seed(seed)`: choose deterministic sharding. +- `with_bucket_count(count)`: trade memory for faster lookups. +- `with_writer_buffer_size(bytes)`: tune streaming IO throughput. +- `with_field_names(true)`: emit MessagePack maps with named fields for easier debugging. +- `with_bucket_size_limit(bytes)`: guard against oversized buckets. +- Supply your own [`MassMapHashLoader`](https://docs.rs/massmap/latest/massmap/trait.MassMapHashLoader.html) implementation in place of the default loader to plug in custom hashers. + ## Readers and Writers `MassMapReader` and `MassMapWriter` abstract over positional IO. The traits are implemented for `std::fs::File` out of the box, but they can also wrap network storage, memory-mapped regions, or custom paged backends. Override `MassMapReader::batch_read_at` to dispatch vectored reads when available. 
diff --git a/examples/massmap.rs b/examples/massmap.rs index 7b25140..8d9c055 100644 --- a/examples/massmap.rs +++ b/examples/massmap.rs @@ -100,7 +100,7 @@ impl MassMapHashLoader for MassMapTolerableHashLoader { fn run_info(args: InfoArgs) -> Result<()> { let file = File::open(&args.input)?; - let map = MassMap::::load(file)?; + let map = MassMap::::load(file)?; let json = serde_json::to_string_pretty(&map.info()) .map_err(|e| Error::other(format!("Failed to format JSON: {e}")))?; diff --git a/src/builder.rs b/src/builder.rs index b706f95..38b57e7 100644 --- a/src/builder.rs +++ b/src/builder.rs @@ -9,12 +9,14 @@ use crate::{ /// Builder type for emitting massmap files from key-value iterators. /// -/// The builder owns configuration such as hash seed, bucket sizing and IO -/// buffering. Use [`build`](Self::build) to stream MessagePack-encoded buckets to -/// a [`MassMapWriter`] sink (typically a file implementing `FileExt`). +/// The builder owns configuration such as the hash seed, bucket sizing, IO +/// buffering, field-name emission, and optional bucket size guards. Use +/// [`build`](Self::build) to stream MessagePack-encoded buckets to a +/// [`MassMapWriter`] sink (typically a file implementing `FileExt`). /// -/// Cloning is not required; each builder instance is consumed by a single call -/// to [`build`](Self::build). +/// The loader type parameter `H` allows swapping in custom +/// [`MassMapHashLoader`] implementations. Each builder instance is consumed by a +/// single call to [`build`](Self::build). #[derive(Debug)] pub struct MassMapBuilder { hash_config: MassMapHashConfig, @@ -75,14 +77,18 @@ impl MassMapBuilder { /// Controls whether serialized MessagePack maps include field names. /// - /// Enabling this makes the output human readable at the cost of slightly - /// larger files. + /// Enabling this makes the serialized buckets human readable at the cost + /// of slightly larger files and additional encoding work. 
pub fn with_field_names(mut self, value: bool) -> Self { self.field_names = value; self } /// Sets a hard cap on the number of bytes allowed per bucket payload. + /// + /// Buckets that exceed this limit cause [`build`](Self::build) to abort + /// with `ErrorKind::InvalidData`, which can be useful when targeting + /// systems with strict per-request IO ceilings. pub fn with_bucket_size_limit(mut self, limit: u32) -> Self { self.bucket_size_limit = limit; self @@ -91,8 +97,9 @@ impl MassMapBuilder { /// Consumes the builder and writes a massmap to `writer` from `entries`. /// /// The iterator is hashed according to the configured parameters, buckets - /// are serialized via `rmp-serde`, and a [`MassMapInfo`] summary is returned - /// on success. + /// are serialized via `rmp-serde`, and a [`MassMapInfo`] summary is + /// returned on success. Input ordering does not matter; keys are + /// automatically distributed across buckets. /// /// # Errors /// diff --git a/src/massmap.rs b/src/massmap.rs index 59da10b..110e982 100644 --- a/src/massmap.rs +++ b/src/massmap.rs @@ -19,6 +19,8 @@ use crate::{ /// - `K`: key type stored in the map; must implement `serde::Deserialize`. /// - `V`: value type stored in the map; must implement `serde::Deserialize` and `Clone`. /// - `R`: reader that satisfies [`MassMapReader`]. +/// - `H`: hash loader used to reconstruct the [`BuildHasher`](BuildHasher) from +/// the persisted [`MassMapHashConfig`](crate::MassMapHashConfig). #[derive(Debug)] pub struct MassMap { /// Header serialized at the start of the massmap file. @@ -165,7 +167,8 @@ where /// /// The iterator reads each bucket sequentially from the backing storage, /// deserializes all entries in the bucket, and yields them one at a time. - /// Each bucket is fully loaded into memory before any of its entries are yielded. + /// Each bucket is fully loaded into memory before any of its entries are + /// yielded. Iteration stops immediately if a bucket fails to deserialize. 
/// /// # Examples /// @@ -230,7 +233,8 @@ where /// Iterator over all entries in a [`MassMap`]. /// /// This iterator traverses buckets sequentially, loading each bucket fully into -/// memory before yielding its entries one by one. +/// memory before yielding its entries one by one. Items are returned as +/// `Result`s so that IO or deserialization failures propagate to the caller. pub struct MassMapIter<'a, K, V, R: MassMapReader, H: MassMapHashLoader> { map: &'a MassMap, bucket_index: usize, diff --git a/src/meta.rs b/src/meta.rs index 0c11e6c..96924dd 100644 --- a/src/meta.rs +++ b/src/meta.rs @@ -72,7 +72,8 @@ pub struct MassMapMeta { pub bucket_count: u64, /// Number of empty buckets. pub empty_buckets: u64, - /// Hash configuration. + /// Hash configuration used to derive the [`BuildHasher`](std::hash::BuildHasher) + /// when reopening the map. pub hash_config: MassMapHashConfig, } diff --git a/src/reader.rs b/src/reader.rs index caa496b..d7285ce 100644 --- a/src/reader.rs +++ b/src/reader.rs @@ -3,7 +3,9 @@ use std::{borrow::Borrow, io::Result}; /// Trait abstracting read access to massmap files. /// /// Implementations must support positional reads without mutating shared state. -/// The trait is blanket-implemented for platform-specific `FileExt` handles. +/// The trait is blanket-implemented for platform-specific `FileExt` handles, +/// but can also wrap memory-mapped regions or networked block stores. Override +/// [`batch_read_at`](Self::batch_read_at) to surface vectored IO capabilities. pub trait MassMapReader { /// Reads `length` bytes starting at `offset` and forwards them to `f`. ///