diff --git a/Cargo.toml b/Cargo.toml
index 6e6cefb..8f418b5 100644
--- a/Cargo.toml
+++ b/Cargo.toml
@@ -122,6 +122,9 @@ argon2 = "0.5"
# Lightweight regex
regex-lite = "0.1"
+# HTML entity decoding
+html-escape = "0.2"
+
# Testing
tokio-test = "0.4"
tempfile = "3"
diff --git a/crates/openfang-channels/Cargo.toml b/crates/openfang-channels/Cargo.toml
index 3e8e452..043f970 100644
--- a/crates/openfang-channels/Cargo.toml
+++ b/crates/openfang-channels/Cargo.toml
@@ -26,6 +26,7 @@ hmac = { workspace = true }
sha2 = { workspace = true }
base64 = { workspace = true }
hex = { workspace = true }
+html-escape = { workspace = true }
[dev-dependencies]
tokio-test = { workspace = true }
diff --git a/crates/openfang-channels/src/mastodon.rs b/crates/openfang-channels/src/mastodon.rs
index 21b9c42..9960b36 100644
--- a/crates/openfang-channels/src/mastodon.rs
+++ b/crates/openfang-channels/src/mastodon.rs
@@ -261,45 +261,42 @@ fn parse_mastodon_notification(
})
}
-/// Simple HTML tag stripper for Mastodon status content.
-///
-/// Mastodon returns HTML in status content. This strips tags and decodes
-/// common HTML entities. For production, consider a proper HTML sanitizer.
+/// Strip HTML tags and decode entities from Mastodon status content.
fn strip_html_tags(html: &str) -> String {
let mut result = String::with_capacity(html.len());
let mut in_tag = false;
+ let mut tag_buf = String::new();
for ch in html.chars() {
match ch {
'<' => {
in_tag = true;
- // Check if this is a
or
Hello π¦ world
"; + let result = strip_html_tags(html); + assert!(result.contains("Hello")); + assert!(result.contains("π¦")); + assert!(result.contains("world")); + assert!(!result.contains("")); + } + + #[test] + fn strip_html_handles_cjk_without_panic() { + let html = "
こんにちは
世界
Hello world
"; + let result = strip_html_tags(html); + assert_eq!(result.trim(), "Hello world"); + } + #[test] fn test_parse_mastodon_notification_visibility() { let notif = serde_json::json!({