Intermediate
Text Moderation
Build the text moderation module with toxicity detection using OpenAI Moderation API, PII filtering with regex patterns, and custom banned word/pattern rules.
OpenAI Moderation API
The OpenAI Moderation API is free and detects: hate, harassment, self-harm, sexual content, violence, and more. We will use it as our primary text analyzer.
# app/moderation/text.py
# Text moderation: OpenAI Moderation API scoring, regex-based PII
# detection, and custom banned-word rules.
import re
import logging
from dataclasses import dataclass
from openai import OpenAI
from app.config import get_settings

# Module-level logger, namespaced to this module.
logger = logging.getLogger(__name__)
# Application settings (OpenAI API key, moderation/review thresholds).
settings = get_settings()
@dataclass
class TextModerationResult:
    """Outcome of moderating one piece of text.

    Aggregates the OpenAI moderation verdict, regex PII hits, and
    custom banned-word matches into a single recommended action.
    """

    # True when the OpenAI Moderation API flagged the text.
    flagged: bool
    # Per-category boolean verdicts from the Moderation API.
    categories: dict[str, bool]
    # Per-category confidence scores from the Moderation API.
    scores: dict[str, float]
    # Detected PII items, each {"type": ..., "value": <masked prefix>}.
    pii_found: list[dict]
    # Banned words from the custom list that matched the text.
    custom_matches: list[str]
    # Highest category score (0.0 when the API returned no scores).
    max_score: float
    recommended_action: str  # "approve", "review", "reject"
class TextModerator:
def __init__(self):
self.client = OpenAI(api_key=settings.openai_api_key)
self.pii_patterns = {
"email": r"[a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\.[a-zA-Z]{2,}",
"phone": r"\d{3}[-.]?\d{3}[-.]?\d{4}",
"ssn": r"\d{3}-\d{2}-\d{4}",
"credit_card": r"\d{4}[\s-]?\d{4}[\s-]?\d{4}[\s-]?\d{4}",
}
self.banned_words: list[str] = [] # Load from config
def moderate(self, text: str) -> TextModerationResult:
# OpenAI moderation
response = self.client.moderations.create(input=text)
result = response.results[0]
categories = {k: v for k, v in result.categories.__dict__.items()
if not k.startswith("_")}
scores = {k: v for k, v in result.category_scores.__dict__.items()
if not k.startswith("_")}
max_score = max(scores.values()) if scores else 0.0
# PII detection
pii_found = self._detect_pii(text)
# Custom rules
custom_matches = self._check_custom_rules(text)
# Determine action
if max_score >= settings.moderation_threshold or result.flagged:
action = "reject"
elif max_score >= settings.review_threshold or pii_found or custom_matches:
action = "review"
else:
action = "approve"
return TextModerationResult(
flagged=result.flagged, categories=categories,
scores=scores, pii_found=pii_found,
custom_matches=custom_matches,
max_score=max_score, recommended_action=action,
)
def _detect_pii(self, text: str) -> list[dict]:
found = []
for pii_type, pattern in self.pii_patterns.items():
matches = re.findall(pattern, text)
for match in matches:
found.append({"type": pii_type, "value": match[:4] + "***"})
return found
def _check_custom_rules(self, text: str) -> list[str]:
text_lower = text.lower()
return [w for w in self.banned_words if w.lower() in text_lower]
Testing Text Moderation
# Quick manual smoke test for the text moderation pipeline.
from app.moderation.text import TextModerator

moderator = TextModerator()

# 1) Benign text should pass straight through.
result = moderator.moderate("Hello, this is a normal message about the weather.")
print(f"Action: {result.recommended_action}")  # approve
print(f"Max score: {result.max_score:.4f}")

# 2) Personal data (email + SSN) should be routed to human review.
result = moderator.moderate("My email is test@example.com and SSN is 123-45-6789")
print(f"Action: {result.recommended_action}")  # review
print(f"PII found: {result.pii_found}")

# 3) Threatening language should be rejected outright.
result = moderator.moderate("I hate you and want to hurt you badly")
print(f"Action: {result.recommended_action}")  # reject
print(f"Categories: {[k for k, v in result.categories.items() if v]}")
PII redaction: For production, replace detected PII with placeholders before storing content. Use
re.sub(pattern, "[REDACTED]", text) to strip sensitive data while keeping the moderation decision.
Key Takeaways
- OpenAI Moderation API is free and covers hate, harassment, self-harm, sexual, and violence categories.
- PII detection uses regex patterns for emails, phones, SSNs, and credit card numbers.
- Two-threshold system: high scores auto-reject, medium scores go to human review, low scores auto-approve.
- Custom banned word lists add organization-specific content rules.
Lilly Tech Systems