diff --git a/Cargo.toml b/Cargo.toml index 6e6cefb..8f418b5 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -122,6 +122,9 @@ argon2 = "0.5" # Lightweight regex regex-lite = "0.1" +# HTML entity decoding +html-escape = "0.2" + # Testing tokio-test = "0.4" tempfile = "3" diff --git a/crates/openfang-channels/Cargo.toml b/crates/openfang-channels/Cargo.toml index 3e8e452..043f970 100644 --- a/crates/openfang-channels/Cargo.toml +++ b/crates/openfang-channels/Cargo.toml @@ -26,6 +26,7 @@ hmac = { workspace = true } sha2 = { workspace = true } base64 = { workspace = true } hex = { workspace = true } +html-escape = { workspace = true } [dev-dependencies] tokio-test = { workspace = true } diff --git a/crates/openfang-channels/src/mastodon.rs b/crates/openfang-channels/src/mastodon.rs index 21b9c42..9960b36 100644 --- a/crates/openfang-channels/src/mastodon.rs +++ b/crates/openfang-channels/src/mastodon.rs @@ -261,45 +261,42 @@ fn parse_mastodon_notification( }) } -/// Simple HTML tag stripper for Mastodon status content. -/// -/// Mastodon returns HTML in status content. This strips tags and decodes -/// common HTML entities. For production, consider a proper HTML sanitizer. +/// Strip HTML tags and decode entities from Mastodon status content. fn strip_html_tags(html: &str) -> String { let mut result = String::with_capacity(html.len()); let mut in_tag = false; + let mut tag_buf = String::new(); for ch in html.chars() { match ch { '<' => { in_tag = true; - // Check if this is a
or

— insert newline - if html[result.len()..].starts_with("' if in_tag => { + in_tag = false; + // Insert newline for block-level tags + let tag_lower = tag_buf.to_lowercase(); + if tag_lower.starts_with("br") + || tag_lower.starts_with("/p") + || tag_lower.starts_with("/div") + || tag_lower.starts_with("/li") { result.push('\n'); } } - '>' => { - in_tag = false; + _ if in_tag => { + tag_buf.push(ch); } - _ if !in_tag => { + _ => { result.push(ch); } - _ => {} } } - // Decode common HTML entities - result - .replace("&amp;", "&") - .replace("&lt;", "<") - .replace("&gt;", ">") - .replace("&quot;", "\"") - .replace("'", "'") - .replace("'", "'") - .trim() - .to_string() + // Decode HTML entities using html_escape crate + let decoded = html_escape::decode_html_entities(&result); + decoded.trim().to_string() } #[async_trait] @@ -639,6 +636,41 @@ mod tests { assert!(parse_mastodon_notification(&notif, "acct-999").is_none()); } + #[test] + fn strip_html_handles_emoji_without_panic() { + // This crashes the current implementation because emoji are multi-byte + let html = "

Hello 🦀 world

"; + let result = strip_html_tags(html); + assert!(result.contains("Hello")); + assert!(result.contains("🦀")); + assert!(result.contains("world")); + assert!(!result.contains("

")); + } + + #[test] + fn strip_html_handles_cjk_without_panic() { + let html = "

こんにちは
世界

"; + let result = strip_html_tags(html); + assert!(result.contains("こんにちは")); + assert!(result.contains("世界")); + } + + #[test] + fn strip_html_decodes_numeric_entities() { + let html = "curly’s & ’"; + let result = strip_html_tags(html); + assert!(result.contains("&"), "Should decode &"); + // Numeric entities should at least not crash + assert!(!result.contains("’") || result.contains("\u{2019}")); + } + + #[test] + fn strip_html_basic_tags() { + let html = "

Hello world

"; + let result = strip_html_tags(html); + assert_eq!(result.trim(), "Hello world"); + } + #[test] fn test_parse_mastodon_notification_visibility() { let notif = serde_json::json!({