Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
3 changes: 3 additions & 0 deletions Cargo.toml
Original file line number Diff line number Diff line change
Expand Up @@ -122,6 +122,9 @@ argon2 = "0.5"
# Lightweight regex
regex-lite = "0.1"

# HTML entity decoding
html-escape = "0.2"

# Testing
tokio-test = "0.4"
tempfile = "3"
Expand Down
1 change: 1 addition & 0 deletions crates/openfang-channels/Cargo.toml
Original file line number Diff line number Diff line change
Expand Up @@ -26,6 +26,7 @@ hmac = { workspace = true }
sha2 = { workspace = true }
base64 = { workspace = true }
hex = { workspace = true }
html-escape = { workspace = true }

[dev-dependencies]
tokio-test = { workspace = true }
74 changes: 53 additions & 21 deletions crates/openfang-channels/src/mastodon.rs
Original file line number Diff line number Diff line change
Expand Up @@ -261,45 +261,42 @@ fn parse_mastodon_notification(
})
}

// NOTE(review): this span is rendered from a diff view. Lines from the removed
// and the added version of `strip_html_tags` appear interleaved, so the listing
// does not compile as shown.
/// Simple HTML tag stripper for Mastodon status content.
///
/// Mastodon returns HTML in status content. This strips tags and decodes
/// common HTML entities. For production, consider a proper HTML sanitizer.
/// Strip HTML tags and decode entities from Mastodon status content.
fn strip_html_tags(html: &str) -> String {
// Output buffer; the input length is used as the initial capacity.
let mut result = String::with_capacity(html.len());
// True while the scan is between a '<' and the next '>'.
let mut in_tag = false;
// Collects the characters of the tag currently being scanned, without the
// angle brackets.
let mut tag_buf = String::new();

for ch in html.chars() {
match ch {
'<' => {
in_tag = true;
// Check if this is a <br> or </p> — insert newline
// NOTE(review): `result.len()` is the byte length of the output, not a
// position in `html`; slicing `html` with it panics whenever that offset
// is not on a char boundary of `html`.
if html[result.len()..].starts_with("<br")
|| html[result.len()..].starts_with("</p")
// Start collecting a fresh tag.
tag_buf.clear();
}
'>' if in_tag => {
in_tag = false;
// Insert newline for block-level tags
// These are prefix matches on the lowercased tag text, so `br` also covers
// `br/` and `br ...`, and `/p` also matches longer names such as `/pre`.
let tag_lower = tag_buf.to_lowercase();
if tag_lower.starts_with("br")
|| tag_lower.starts_with("/p")
|| tag_lower.starts_with("/div")
|| tag_lower.starts_with("/li")
{
result.push('\n');
}
}
'>' => {
in_tag = false;
_ if in_tag => {
// Inside a tag: buffer the character instead of emitting it.
tag_buf.push(ch);
}
_ if !in_tag => {
_ => {
// Outside a tag: copy the character through to the output.
result.push(ch);
}
_ => {}
}
}

// Decode common HTML entities
result
.replace("&amp;", "&")
.replace("&lt;", "<")
.replace("&gt;", ">")
.replace("&quot;", "\"")
.replace("&#39;", "'")
.replace("&apos;", "'")
.trim()
.to_string()
// Decode HTML entities using html_escape crate
// Decoding runs after tag stripping, so escaped markup such as `&lt;b&gt;`
// comes out as literal text rather than being stripped.
let decoded = html_escape::decode_html_entities(&result);
// Trimming also drops the newline pushed for a trailing closing tag.
decoded.trim().to_string()
}

#[async_trait]
Expand Down Expand Up @@ -639,6 +636,41 @@ mod tests {
assert!(parse_mastodon_notification(&notif, "acct-999").is_none());
}

#[test]
fn strip_html_handles_emoji_without_panic() {
    // Multi-byte characters (here a 4-byte emoji) must pass through the
    // stripper untouched and must not cause a panic.
    let html = "<p>Hello 🦀 world</p>";
    let result = strip_html_tags(html);
    // Exact match: both tags are gone, the text is intact, and the newline
    // emitted for the closing `</p>` is removed by the final trim.
    assert_eq!(result, "Hello 🦀 world");
}

#[test]
fn strip_html_handles_cjk_without_panic() {
    // CJK text is made of 3-byte characters; it must survive tag stripping
    // without a panic.
    let html = "<p>こんにちは<br>世界</p>";
    let result = strip_html_tags(html);
    // Exact match instead of `contains`, which would also pass if the input
    // came back unchanged: `<br>` becomes a line break, the `<p>` wrapper is
    // removed, and the trailing newline from `</p>` is trimmed.
    assert_eq!(result, "こんにちは\n世界");
}

#[test]
fn strip_html_decodes_numeric_entities() {
    // One decimal, one named and one hexadecimal entity.
    let html = "curly&#8217;s &amp; &#x2019;";
    let result = strip_html_tags(html);
    // Compare the whole string: a bare `contains("&")` check would pass even
    // if `&amp;` were left undecoded, because the entity itself contains `&`.
    assert_eq!(
        result, "curly\u{2019}s & \u{2019}",
        "decimal, named and hex entities should all be decoded"
    );
}

#[test]
fn strip_html_basic_tags() {
    // Block and inline tags are removed; only the text content remains.
    let stripped = strip_html_tags("<p>Hello <b>world</b></p>");
    let expected = "Hello world";
    assert_eq!(stripped.trim(), expected);
}

#[test]
fn test_parse_mastodon_notification_visibility() {
let notif = serde_json::json!({
Expand Down