Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions Cargo.lock

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

5 changes: 5 additions & 0 deletions Cargo.toml
Original file line number Diff line number Diff line change
Expand Up @@ -12,6 +12,7 @@ sql-builder="3.1"
interner="*"
compressed_string = "*"
csv = "1.1"
flate2 = "1.0"
extended = "*"
ascii = "*"
bstr = "1.7.0"
Expand All @@ -33,6 +34,10 @@ path = "src/lib.rs"
name = "abacus"
path = "src/bin/abacus.rs"

[[bin]]
name = "dataversion"
path = "src/bin/dataversion.rs"

[[bench]]
name = "tabulate_simple_request_benchmark"
harness = false
84 changes: 84 additions & 0 deletions src/bin/dataversion.rs
Original file line number Diff line number Diff line change
@@ -0,0 +1,84 @@
//! A command-line utility to extract version information from IPUMS data files.
//!
//! This tool reads version metadata from both Parquet and fixed-width IPUMS data files
//! and outputs it in either JSON or human-readable text format.
//!
//! # Usage
//!
//! ```bash
//! # For parquet data (directory containing .parquet files)
//! dataversion /pkg/ipums/usa/output_data/current/parquet/us2015b
//!
//! # For fixed-width data (.dat.gz file)
//! dataversion /pkg/ipums/usa/output_data/current/us2015b_usa.dat.gz
//!
//! # Output as JSON (default is text)
//! dataversion --format json /path/to/data
//! ```

use cimdea::data_version::{extract_version, DataVersion};
use clap::{Parser, ValueEnum};
use std::process;

// Command-line arguments for the `dataversion` binary, parsed by clap's derive API.
//
// NOTE: the `///` doc comments on the fields below are consumed by clap and shown
// as `--help` text, so editing them changes user-visible output. Maintainer notes
// are therefore written as plain `//` comments.
#[derive(Parser, Debug)]
#[command(
name = "dataversion",
// Bare `version` lets clap supply the version string (taken from the crate's package metadata).
version,
about = "Extract version information from IPUMS data files",
long_about = "Extract version information from IPUMS data files.\n\n\
Supports both Parquet and fixed-width (.dat.gz) formats.\n\
Version information includes release numbers, commit hashes,\n\
branch names, and other build metadata."
)]
struct Args {
// Required positional argument; passed by reference to `extract_version` in `main`.
/// Path to the data file or directory.
///
/// For Parquet: path to a directory containing .parquet files
/// (e.g., /pkg/ipums/usa/output_data/current/parquet/us2015b)
///
/// For fixed-width: path to a .dat.gz file
/// (e.g., /pkg/ipums/usa/output_data/current/us2015b_usa.dat.gz)
#[arg(value_name = "PATH")]
path: String,

// Optional `-f` / `--format` flag; falls back to the `text` value when omitted.
/// Output format
#[arg(short, long, value_enum, default_value = "text")]
format: OutputFormat,
}

// Output formats selectable through the `--format` option of `Args`.
//
// `ValueEnum` makes each variant a valid command-line value; the default declared
// on `Args::format` is the string "text", which must keep matching the `Text`
// variant's CLI name. The `///` comments on the variants are used by clap as the
// per-value help text, so maintainer notes are kept in `//` comments.
#[derive(Debug, Clone, Copy, ValueEnum)]
enum OutputFormat {
/// Human-readable text output
Text,
/// Machine-readable JSON output
Json,
}

/// Entry point: parses the command line, extracts the version metadata from the
/// given path, and prints it in the requested format.
///
/// If extraction fails, the error is reported on stderr and the process exits
/// with status 1.
fn main() {
    let cli = Args::parse();

    // Guard clause: bail out immediately on failure so the happy path stays flat.
    let data_version = extract_version(&cli.path).unwrap_or_else(|err| {
        eprintln!("Error: {}", err);
        process::exit(1)
    });

    output_version(&data_version, cli.format);
}

/// Prints `version` to stdout in the requested `format`.
///
/// Text output is printed directly from `version.to_text()`. JSON output comes
/// from `version.to_json()`; if that serialization fails, the error is written
/// to stderr and the process exits with status 1.
fn output_version(version: &DataVersion, format: OutputFormat) {
    match format {
        OutputFormat::Text => println!("{}", version.to_text()),
        OutputFormat::Json => {
            // Resolve the serialization result first; a failure terminates the process.
            let rendered = version.to_json().unwrap_or_else(|err| {
                eprintln!("Error serializing to JSON: {}", err);
                process::exit(1)
            });
            println!("{}", rendered);
        }
    }
}
13 changes: 5 additions & 8 deletions src/conventions.rs
Original file line number Diff line number Diff line change
Expand Up @@ -155,7 +155,8 @@ impl MicroDataCollection {
})?;

// Collect record types and filenames first to avoid borrow issues
let record_types: Vec<(String, String)> = self.record_types
let record_types: Vec<(String, String)> = self
.record_types
.keys()
.map(|k| {
let base_filename = self.base_filename_for_dataset_and_rectype(dataset_name, k);
Expand Down Expand Up @@ -254,10 +255,7 @@ impl MicroDataCollection {
/// Takes a path like ../output_data/current/parquet/, which could be derived
/// automatically from defaults based on data root or product root. Scans all
/// parquet schema information and embedded metadata.
pub fn load_metadata_from_all_parquet(
&mut self,
parquet_path: &Path,
) -> Result<(), MdError> {
pub fn load_metadata_from_all_parquet(&mut self, parquet_path: &Path) -> Result<(), MdError> {
if !parquet_path.exists() {
return Err(metadata_error!(
"Parquet path does not exist: {}",
Expand All @@ -278,9 +276,8 @@ impl MicroDataCollection {
let mut errors = Vec::new();

for entry in entries {
let entry = entry.map_err(|e| {
metadata_error!("Failed to read directory entry: {}", e)
})?;
let entry =
entry.map_err(|e| metadata_error!("Failed to read directory entry: {}", e))?;

let path = entry.path();
if path.is_dir() {
Expand Down
Loading
Loading