From cbe43d914c2856210338fa9b170fe4ef575d1b13 Mon Sep 17 00:00:00 2001 From: Zumie Date: Fri, 13 Feb 2026 20:08:34 -0600 Subject: [PATCH 1/2] Fix audit/import-missing matching: use normalized blake3 hash --- crates/cli/src/main.rs | 40 ++++++++++++++++++++++++++++++++++------ 1 file changed, 34 insertions(+), 6 deletions(-) diff --git a/crates/cli/src/main.rs b/crates/cli/src/main.rs index d42de9d..05d820a 100644 --- a/crates/cli/src/main.rs +++ b/crates/cli/src/main.rs @@ -1425,6 +1425,7 @@ struct AuditFileResult { bytes: usize, matched: bool, matched_id: Option, + matched_hash: Option, } fn collect_memory_files(root: &Path) -> anyhow::Result> { @@ -1450,14 +1451,32 @@ fn collect_memory_files(root: &Path) -> anyhow::Result> { fn audit_files(root: &Path, files: &[PathBuf], items: &[MemoryItem]) -> AuditResult { let mut results = Vec::new(); let mut matched = 0; + + fn audit_norm(s: &str) -> &str { + // Normalize for file-memory matching. + // Treat trailing whitespace/newlines as non-semantic to avoid duplicate imports + // when editors add/remove a final newline. + s.trim_end() + } + + fn blake3_hex(s: &str) -> String { + blake3::hash(s.as_bytes()).to_hex().to_string() + } + for path in files { let content = std::fs::read_to_string(path).unwrap_or_default(); let bytes = content.len(); - let matched_item = items - .iter() - .find(|item| item.text == content) - .map(|item| item.id.clone()); - let is_match = matched_item.is_some(); + + let file_hash = blake3_hex(audit_norm(&content)); + + let matched_item = items.iter().find(|item| { + // Use hash match (normalized) instead of exact string equality. + // This prevents silent duplication due to trivial whitespace changes. + blake3_hex(audit_norm(&item.text)) == file_hash + }); + + let matched_id = matched_item.map(|item| item.id.clone()); + let is_match = matched_id.is_some(); if is_match { matched += 1; } @@ -1465,7 +1484,8 @@ fn audit_files(root: &Path, files: &[PathBuf], items: &[MemoryItem]) -> AuditRes file: path.display().to_string(), bytes, matched: is_match, - matched_id: matched_item, + matched_id, + matched_hash: if is_match { Some(file_hash) } else { None }, }); } @@ -1489,9 +1509,17 @@ async fn import_missing_files(db: &MemoryDb, audit: &AuditResult) -> anyhow::Res if content.trim().is_empty() { continue; } + + // Include a stable content hash tag to aid future dedupe/debugging. + // (Normalization matches audit behavior: ignore trailing whitespace.) + let content_hash = blake3::hash(content.trim_end().as_bytes()) + .to_hex() + .to_string(); + let tags = vec![ "file-memory".to_string(), format!("source:{}", path.display()), + format!("contenthash:blake3:{}", content_hash), ]; let item = MemoryItem { id: uuid::Uuid::new_v4().to_string(), From c331f73458f5b3a8a40533aa706e5589f01f1f8c Mon Sep 17 00:00:00 2001 From: Zumie Date: Fri, 13 Feb 2026 22:14:19 -0600 Subject: [PATCH 2/2] audit: match file-memory by source tag and report drift --- crates/cli/src/main.rs | 40 ++++++++++++++++++++++++++++++++-------- 1 file changed, 32 insertions(+), 8 deletions(-) diff --git a/crates/cli/src/main.rs b/crates/cli/src/main.rs index 05d820a..de35382 100644 --- a/crates/cli/src/main.rs +++ b/crates/cli/src/main.rs @@ -1423,9 +1423,15 @@ struct AuditResult { struct AuditFileResult { file: String, bytes: usize, + /// A file is considered matched if wagl already has a file-memory item with `source:`. matched: bool, matched_id: Option, - matched_hash: Option, + /// Current file content hash (normalized). + file_hash: String, + /// Stored item content hash (normalized), if a match was found. + stored_hash: Option, + /// True if a matched file's current content differs from the stored item text. + drifted: bool, } fn collect_memory_files(root: &Path) -> anyhow::Result> { @@ -1469,23 +1475,41 @@ fn audit_files(root: &Path, files: &[PathBuf], items: &[MemoryItem]) -> AuditRes let file_hash = blake3_hex(audit_norm(&content)); - let matched_item = items.iter().find(|item| { - // Use hash match (normalized) instead of exact string equality. - // This prevents silent duplication due to trivial whitespace changes. - blake3_hex(audit_norm(&item.text)) == file_hash - }); + let src_tag = format!("source:{}", path.display()); + let matched_id = items + .iter() + .find(|item| { + item.tags.iter().any(|t| t == "file-memory") + && item.tags.iter().any(|t| t == &src_tag) + }) + .map(|item| item.id.clone()); - let matched_id = matched_item.map(|item| item.id.clone()); let is_match = matched_id.is_some(); if is_match { matched += 1; } + + let stored_hash = matched_id.as_deref().and_then(|id| { + items + .iter() + .find(|item| item.id == id) + .map(|item| blake3_hex(audit_norm(&item.text))) + }); + + // If we matched by source tag, check whether content drifted (e.g., file edited). + let drifted = stored_hash + .as_deref() + .map(|h| h != file_hash.as_str()) + .unwrap_or(false); + results.push(AuditFileResult { file: path.display().to_string(), bytes, matched: is_match, matched_id, - matched_hash: if is_match { Some(file_hash) } else { None }, + file_hash, + stored_hash, + drifted, }); }