From 218da8ec2ff88e51d80e3e2e5731beddd70f89d6 Mon Sep 17 00:00:00 2001
From: duongvandinh
Date: Fri, 27 Feb 2026 11:03:53 +0700
Subject: [PATCH 1/2] fix(mastodon): replace hand-rolled HTML sanitizer with
html_escape
The previous strip_html_tags() used html[result.len()..] byte indexing
on a char-iterated string, causing panics on multi-byte UTF-8 content
(emoji, CJK). Replace with a safe char-based tag stripper and use
html_escape::decode_html_entities for comprehensive entity decoding
including numeric entities (&#8217;, &#x2019;).
---
crates/openfang-channels/Cargo.toml | 1 +
crates/openfang-channels/src/mastodon.rs | 74 +++++++++++++++++-------
2 files changed, 54 insertions(+), 21 deletions(-)
diff --git a/crates/openfang-channels/Cargo.toml b/crates/openfang-channels/Cargo.toml
index 3e8e452..ed9c15d 100644
--- a/crates/openfang-channels/Cargo.toml
+++ b/crates/openfang-channels/Cargo.toml
@@ -26,6 +26,7 @@ hmac = { workspace = true }
sha2 = { workspace = true }
base64 = { workspace = true }
hex = { workspace = true }
+html-escape = "0.2"
[dev-dependencies]
tokio-test = { workspace = true }
diff --git a/crates/openfang-channels/src/mastodon.rs b/crates/openfang-channels/src/mastodon.rs
index 21b9c42..9960b36 100644
--- a/crates/openfang-channels/src/mastodon.rs
+++ b/crates/openfang-channels/src/mastodon.rs
@@ -261,45 +261,42 @@ fn parse_mastodon_notification(
})
}
-/// Simple HTML tag stripper for Mastodon status content.
-///
-/// Mastodon returns HTML in status content. This strips tags and decodes
-/// common HTML entities. For production, consider a proper HTML sanitizer.
+/// Strip HTML tags and decode entities from Mastodon status content.
fn strip_html_tags(html: &str) -> String {
let mut result = String::with_capacity(html.len());
let mut in_tag = false;
+ let mut tag_buf = String::new();
for ch in html.chars() {
match ch {
'<' => {
in_tag = true;
- // Check if this is a <br> or </p> — insert newline
- if html[result.len()..].starts_with("<br")
- || html[result.len()..].starts_with("</p")
+ tag_buf.clear();
+ }
+ '>' if in_tag => {
+ in_tag = false;
+ // Insert newline for block-level tags
+ let tag_lower = tag_buf.to_lowercase();
+ if tag_lower.starts_with("br")
+ || tag_lower.starts_with("/p")
+ || tag_lower.starts_with("/div")
+ || tag_lower.starts_with("/li")
{
result.push('\n');
}
}
- '>' => {
- in_tag = false;
+ _ if in_tag => {
+ tag_buf.push(ch);
}
- _ if !in_tag => {
+ _ => {
result.push(ch);
}
- _ => {}
}
}
- // Decode common HTML entities
- result
- .replace("&amp;", "&")
- .replace("&lt;", "<")
- .replace("&gt;", ">")
- .replace("&quot;", "\"")
- .replace("&#39;", "'")
- .replace("&apos;", "'")
- .trim()
- .to_string()
+ // Decode HTML entities using html_escape crate
+ let decoded = html_escape::decode_html_entities(&result);
+ decoded.trim().to_string()
}
#[async_trait]
@@ -639,6 +636,41 @@ mod tests {
assert!(parse_mastodon_notification(&notif, "acct-999").is_none());
}
+ #[test]
+ fn strip_html_handles_emoji_without_panic() {
+ // This crashes the current implementation because emoji are multi-byte
+ let html = "<p>Hello 🦀 world</p>";
+ let result = strip_html_tags(html);
+ assert!(result.contains("Hello"));
+ assert!(result.contains("🦀"));
+ assert!(result.contains("world"));
+ assert!(!result.contains("<p>"));
+ }
+
+ #[test]
+ fn strip_html_handles_cjk_without_panic() {
+ let html = "<p>こんにちは<br>世界</p>";
+ let result = strip_html_tags(html);
+ assert!(result.contains("こんにちは"));
+ assert!(result.contains("世界"));
+ }
+
+ #[test]
+ fn strip_html_decodes_numeric_entities() {
+ let html = "curly&#8217;s &amp; &#x2019;";
+ let result = strip_html_tags(html);
+ assert!(result.contains("&"), "Should decode &amp;");
+ // Numeric entities should at least not crash
+ assert!(!result.contains("&#8217;") || result.contains("\u{2019}"));
+ }
+
+ #[test]
+ fn strip_html_basic_tags() {
+ let html = "<p>Hello <b>world</b></p>";
+ let result = strip_html_tags(html);
+ assert_eq!(result.trim(), "Hello world");
+ }
+
#[test]
fn test_parse_mastodon_notification_visibility() {
let notif = serde_json::json!({
From 0a1d1347d9cf4731cacd251e85b96a91237a6b3c Mon Sep 17 00:00:00 2001
From: duongvandinh
Date: Fri, 27 Feb 2026 11:26:48 +0700
Subject: [PATCH 2/2] fix: use workspace dependency for html-escape
Move html-escape from direct crate dependency to workspace-level
declaration in root Cargo.toml, per CONTRIBUTING.md convention.
---
Cargo.toml | 3 +++
crates/openfang-channels/Cargo.toml | 2 +-
2 files changed, 4 insertions(+), 1 deletion(-)
diff --git a/Cargo.toml b/Cargo.toml
index 6e6cefb..8f418b5 100644
--- a/Cargo.toml
+++ b/Cargo.toml
@@ -122,6 +122,9 @@ argon2 = "0.5"
# Lightweight regex
regex-lite = "0.1"
+# HTML entity decoding
+html-escape = "0.2"
+
# Testing
tokio-test = "0.4"
tempfile = "3"
diff --git a/crates/openfang-channels/Cargo.toml b/crates/openfang-channels/Cargo.toml
index ed9c15d..043f970 100644
--- a/crates/openfang-channels/Cargo.toml
+++ b/crates/openfang-channels/Cargo.toml
@@ -26,7 +26,7 @@ hmac = { workspace = true }
sha2 = { workspace = true }
base64 = { workspace = true }
hex = { workspace = true }
-html-escape = "0.2"
+html-escape = { workspace = true }
[dev-dependencies]
tokio-test = { workspace = true }