AI Content Moderation: Building Safe Applications

📅 December 05, 2025 ⏱️ 1 min read 🏷️ Generative AI

AI content moderation protects users and ensures safe online spaces by detecting harmful content automatically.

OpenAI Moderation API


import openai

# Check content for violations
response = openai.moderations.create(
    input="Sample text to moderate"
)

result = response.results[0]

if result.flagged:
    print("Content flagged!")
    print(f"Categories: {result.categories}")
    print(f"Scores: {result.category_scores}")
else:
    print("Content is safe")

# Check specific categories
if result.categories.hate:
    print("Contains hate speech")
if result.categories.violence:
    print("Contains violent content")

Custom Moderation Pipeline


import openai
from transformers import pipeline

# Toxic comment classifier
toxicity_model = pipeline("text-classification", 
    model="unitary/toxic-bert")

def moderate_content(text):
    # Check toxicity
    toxicity = toxicity_model(text)[0]
    
    # Check with OpenAI
    openai_result = openai.moderations.create(input=text)
    
    # Combine results: toxic-bert's labels are all toxicity categories
    # (toxic, obscene, insult, ...), so treat a high top score as toxic
    is_toxic = toxicity['score'] > 0.5
    is_safe = not is_toxic and not openai_result.results[0].flagged
    
    return {
        'safe': is_safe,
        'toxicity_score': toxicity['score'],
        'openai_flagged': openai_result.results[0].flagged
    }

result = moderate_content("Your text here")
print(result)
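
Moderation queues rarely arrive one comment at a time. The transformers pipeline also accepts a list of texts and batches the forward passes, which helps when working through a backlog. A rough sketch reusing the toxicity_model defined above; the example comments and the 0.5 cutoff are placeholder values:

# Example comments are placeholders for user-submitted text
comments = [
    "Thanks, this was really helpful!",
    "You are an idiot and your post is garbage.",
    "Does anyone have a link to the docs?",
]

# Passing a list lets the pipeline batch the forward passes
results = toxicity_model(comments, batch_size=8)

for comment, res in zip(comments, results):
    flagged = res['score'] > 0.5  # all toxic-bert labels are toxicity categories
    print(f"{'FLAG' if flagged else 'OK'} ({res['score']:.2f}): {comment}")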

Image Moderation


from google.cloud import vision

# Google Vision AI for image moderation
client = vision.ImageAnnotatorClient()

with open('image.jpg', 'rb') as image_file:
    content = image_file.read()

image = vision.Image(content=content)
response = client.safe_search_detection(image=image)
safe = response.safe_search_annotation

# Check safety levels (likelihood scale: 1=VERY_UNLIKELY ... 4=LIKELY, 5=VERY_LIKELY)
if safe.adult > 3 or safe.violence > 3:
    print("Inappropriate content detected")
else:
    print("Image is safe")

Real-Time Moderation System


import openai
from transformers import pipeline

class ContentModerator:
    def __init__(self):
        self.toxicity_model = pipeline("text-classification",
            model="unitary/toxic-bert")
    
    def moderate_text(self, text):
        # Quick toxicity check
        toxicity = self.toxicity_model(text)[0]
        
        if toxicity['score'] > 0.8:
            return {'approved': False, 'reason': 'High toxicity'}
        
        # OpenAI moderation
        result = openai.moderations.create(input=text)
        
        if result.results[0].flagged:
            # Pydantic model -> dict so we can list the flagged category names
            flagged = result.results[0].categories.model_dump()
            categories = [k for k, v in flagged.items() if v]
            return {'approved': False, 'reason': f'Violations: {categories}'}
        
        return {'approved': True}

moderator = ContentModerator()
result = moderator.moderate_text("User comment here")
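
To use this in real time, the moderator typically sits behind an API endpoint that approves or rejects content before it is stored. A minimal sketch with FastAPI; the framework, route name, and request shape are assumptions for illustration, not part of the moderation logic itself:

from fastapi import FastAPI
from pydantic import BaseModel

app = FastAPI()
moderator = ContentModerator()  # the class defined above

class Comment(BaseModel):
    text: str

@app.post("/comments")
def submit_comment(comment: Comment):
    # Moderate before the comment is accepted or stored anywhere
    verdict = moderator.moderate_text(comment.text)
    if not verdict['approved']:
        return {"status": "rejected", "reason": verdict["reason"]}
    # In a real application, persist the approved comment here
    return {"status": "accepted"}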

AI moderation creates safer online communities while scaling far beyond what manual review alone can handle!

🏷️ Tags:
content moderation, AI safety, toxic content detection, content filtering