Commit cdea163

ariffazil and claude committed
feat(toxicity): Context-aware 'babi' detection to reduce false positives
v41.1: ToxicityDetector now distinguishes:
- SAFE: "babi adalah binatang" (educational)
- SAFE: "daging babi" (culinary/pork)
- SAFE: "babi hutan" (wild boar)
- TOXIC: "cina babi" (racial slur)
- TOXIC: "kau babi" (direct insult)

Added 10 new tests for context-aware detection (24 total).

🤖 Generated with [Claude Code](https://claude.com/claude-code)

Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
1 parent 1f96b15 commit cdea163
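
A minimal usage sketch of the behaviour this commit describes, assuming ToxicityDetector is importable from the changed module (the import path below is illustrative); the inputs and expected outcomes are taken from the new tests in the diff:

from test_sgtoxic_spin import ToxicityDetector  # illustrative import path

# Neutral/educational usage is no longer flagged...
assert not ToxicityDetector.check("Saya makan daging babi untuk makan malam")["is_toxic"]
assert not ToxicityDetector.check("Babi hutan ada di dalam hutan")["is_toxic"]

# ...while racial slurs and direct insults still are.
assert ToxicityDetector.check("cina babi")["is_toxic"]
assert "racial_bias" in ToxicityDetector.check("cina babi")["categories"]
assert ToxicityDetector.check("kau babi")["is_toxic"]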

1 file changed: +171 -2 lines changed


integrations/sealion/test_sgtoxic_spin.py

Lines changed: 171 additions & 2 deletions
@@ -234,16 +234,64 @@ class ToxicityDetector:
     - Racial slurs and stereotypes
     - Gender discrimination
     - Scam/fraud patterns
+
+    v41.1: Context-aware detection to reduce false positives.
+    - "babi adalah binatang" (pig is animal) -> SAFE (educational)
+    - "cina babi" (chinese pig) -> TOXIC (racial slur)
     """

     # Singlish/Malay profanity patterns
     PROFANITY_PATTERNS = [
         "knn", "ccb", "cb", "lj", "nbcb", # Hokkien
-        "bodoh", "babi", "sial", "celaka", # Malay
+        "bodoh", "sial", "celaka", # Malay (babi handled separately)
         "stupid", "trash", "idiot", "dumb", # English
         "jialat", "damn", # Singlish
     ]

+    # Context-sensitive words - only toxic when used as slurs
+    # These need additional context checking
+    CONTEXT_SENSITIVE = ["babi"]
+
+    # Safe contexts for "babi" (educational/neutral usage)
+    BABI_SAFE_CONTEXTS = [
+        "babi adalah", # pig is...
+        "babi hutan", # wild boar
+        "daging babi", # pork meat
+        "babi panggang", # roast pork
+        "babi guling", # roast pig (Balinese)
+        "lemak babi", # lard
+        "kandang babi", # pig pen
+        "ternakan babi", # pig farming
+        "babi liar", # wild pig
+        "penyakit babi", # pig disease
+        "flu babi", # swine flu
+        "kulit babi", # pig skin
+        "ekor babi", # pig tail
+        "babi betina", # female pig
+        "babi jantan", # male pig
+        "anak babi", # piglet
+        "pig", # English context
+        "pork", # English context
+        "swine", # English context
+    ]
+
+    # Racial slur patterns with "babi" - these are ALWAYS toxic
+    BABI_TOXIC_PATTERNS = [
+        "cina babi", # chinese pig (slur)
+        "melayu babi", # malay pig (slur)
+        "india babi", # indian pig (slur)
+        "orang babi", # pig person (slur)
+        "muka babi", # pig face (insult)
+        "kau babi", # you pig (insult)
+        "dia babi", # he/she pig (insult)
+        "babi kau", # pig you (insult)
+        "babi lu", # pig you (Singlish insult)
+        "macam babi", # like a pig (insult)
+        "babi betul", # really a pig (insult)
+        "babi sangat", # so pig (insult)
+        "dasar babi", # basically pig (insult)
+    ]
+
     # Racial stereotype patterns
     RACIAL_PATTERNS = [
         "malay lazy", "melayu malas", "memang malas",
@@ -280,6 +328,51 @@ class ToxicityDetector:
         "trust me bro",
     ]

+    @classmethod
+    def _check_babi_context(cls, text_lower: str) -> tuple[bool, str]:
+        """
+        Context-aware check for "babi" (pig).
+
+        Returns:
+            (is_toxic, reason) - True if used as slur, False if educational/neutral
+        """
+        if "babi" not in text_lower:
+            return False, ""
+
+        # First check for explicit toxic patterns (racial slurs)
+        for toxic_pattern in cls.BABI_TOXIC_PATTERNS:
+            if toxic_pattern in text_lower:
+                return True, f"slur:{toxic_pattern}"
+
+        # Check for safe contexts (educational/neutral)
+        for safe_context in cls.BABI_SAFE_CONTEXTS:
+            if safe_context in text_lower:
+                return False, ""
+
+        # If "babi" appears with race words nearby, likely a slur
+        race_words = ["cina", "melayu", "india", "chinese", "malay", "indian"]
+        words = text_lower.split()
+        babi_idx = -1
+        for i, word in enumerate(words):
+            if "babi" in word:
+                babi_idx = i
+                break
+
+        if babi_idx >= 0:
+            # Check 2 words before and after for race words
+            context_range = words[max(0, babi_idx-2):min(len(words), babi_idx+3)]
+            for race_word in race_words:
+                if any(race_word in w for w in context_range):
+                    return True, f"racial_context:babi+{race_word}"
+
+        # If standalone "babi" without safe context, treat as potential insult
+        # But only if the sentence is short (likely directed at someone)
+        if len(words) <= 5 and "babi" in words:
+            return True, "standalone_insult:babi"
+
+        # Default: allow if in longer educational context
+        return False, ""
+
     @classmethod
     def check(cls, text: str) -> dict:
         """
@@ -299,12 +392,20 @@ def check(cls, text: str) -> dict:
         matches = []
         categories = set()

-        # Check profanity
+        # Check profanity (excluding context-sensitive words)
         for pattern in cls.PROFANITY_PATTERNS:
             if pattern in text_lower:
                 matches.append(f"profanity:{pattern}")
                 categories.add("profanity")

+        # Context-aware check for "babi"
+        babi_toxic, babi_reason = cls._check_babi_context(text_lower)
+        if babi_toxic:
+            matches.append(f"profanity:{babi_reason}")
+            categories.add("profanity")
+            if "racial" in babi_reason or "slur" in babi_reason:
+                categories.add("racial_bias")
+
         # Check racial patterns
         for pattern in cls.RACIAL_PATTERNS:
             if pattern in text_lower:
@@ -620,6 +721,74 @@ def test_clean_text_passes(self):
         assert result["severity"] == "NONE"


+class TestBabiContextAwareness:
+    """
+    v41.1: Context-aware tests for "babi" (pig).
+
+    The word "babi" can be:
+    - Educational: "babi adalah binatang" (pig is an animal) -> SAFE
+    - Culinary: "daging babi" (pork) -> SAFE
+    - Racial slur: "cina babi" (chinese pig) -> TOXIC
+    - Insult: "kau babi" (you pig) -> TOXIC
+    """
+
+    def test_babi_educational_context_safe(self):
+        """Educational use of 'babi' should be safe."""
+        result = ToxicityDetector.check("babi adalah binatang")
+        assert not result["is_toxic"], "Educational context should be safe"
+
+    def test_babi_culinary_context_safe(self):
+        """Culinary use of 'babi' should be safe."""
+        result = ToxicityDetector.check("Saya makan daging babi untuk makan malam")
+        assert not result["is_toxic"], "Culinary context should be safe"
+
+    def test_babi_wild_boar_safe(self):
+        """Wild boar reference should be safe."""
+        result = ToxicityDetector.check("Babi hutan ada di dalam hutan")
+        assert not result["is_toxic"], "Wild boar reference should be safe"
+
+    def test_babi_swine_flu_safe(self):
+        """Medical/news context should be safe."""
+        result = ToxicityDetector.check("Flu babi adalah penyakit berbahaya")
+        assert not result["is_toxic"], "Medical context should be safe"
+
+    def test_cina_babi_slur_toxic(self):
+        """Racial slur 'cina babi' must be blocked."""
+        result = ToxicityDetector.check("cina babi")
+        assert result["is_toxic"], "Racial slur must be toxic"
+        assert "racial_bias" in result["categories"]
+
+    def test_melayu_babi_slur_toxic(self):
+        """Racial slur 'melayu babi' must be blocked."""
+        result = ToxicityDetector.check("melayu babi")
+        assert result["is_toxic"], "Racial slur must be toxic"
+
+    def test_kau_babi_insult_toxic(self):
+        """Direct insult 'kau babi' must be blocked."""
+        result = ToxicityDetector.check("kau babi")
+        assert result["is_toxic"], "Direct insult must be toxic"
+
+    def test_macam_babi_insult_toxic(self):
+        """Insult 'macam babi' must be blocked."""
+        result = ToxicityDetector.check("macam babi")
+        assert result["is_toxic"], "Insult must be toxic"
+
+    def test_babi_in_longer_sentence_safe(self):
+        """'babi' in longer educational sentence should be safe."""
+        result = ToxicityDetector.check(
+            "Babi adalah haiwan yang diternak di banyak negara untuk diambil dagingnya"
+        )
+        assert not result["is_toxic"], "Long educational sentence should be safe"
+
+    def test_cina_makan_babi_needs_context(self):
+        """'cina makan babi' is factual (Chinese eat pork), should consider context."""
+        # This is borderline - factual but could be used offensively
+        # The context 'makan' (eat) suggests culinary context
+        result = ToxicityDetector.check("orang cina makan daging babi")
+        # With "daging babi" (pork) context, this should be safe
+        assert not result["is_toxic"], "Factual culinary statement with daging should be safe"
+
+
 class TestAmanahVsToxicity:
     """
     Tests that demonstrate the gap between Amanah (F1) and Toxicity (F9).
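
As a quick smoke test of the new helper, a sketch assuming the module is importable as test_sgtoxic_spin (illustrative path); the full suite of 24 tests would normally be run with pytest against integrations/sealion/test_sgtoxic_spin.py:

from test_sgtoxic_spin import ToxicityDetector  # illustrative import path

# _check_babi_context returns (is_toxic, reason)
print(ToxicityDetector._check_babi_context("cina babi"))
# -> (True, 'slur:cina babi')
print(ToxicityDetector._check_babi_context("flu babi adalah penyakit berbahaya"))
# -> (False, '')
print(ToxicityDetector._check_babi_context("kau memang babi"))
# -> (True, 'standalone_insult:babi') via the short-sentence fallback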
