# AI4Poetry/common/utils.py
import random
import re
from typing import List, Tuple, Optional, Dict
import nltk
from nltk.corpus import wordnet as wn
try:
nltk.data.find('corpora/wordnet')
except LookupError:
nltk.download('wordnet', quiet=True)
BANNED_WORDS = {
"fuck", "shit", "damn", "sex", "violence", "kill", "death", "weapon",
"drug", "hate", "stupid", "idiot", "dumb", "ugly", "fat", "racist"
}
UNSAFE_KEYWORDS = {
"violence", "weapon", "gun", "knife", "blood", "murder", "kill", "death",
"sex", "sexual", "porn", "nude", "naked", "drug", "cocaine", "weed",
"hate", "racist", "nazi", "terror", "bomb", "suicide"
}
def check_input_safety(text: str) -> Tuple[bool, str]:
"""Check if input text is safe for children.
Returns:
Tuple of (is_safe: bool, reason: str)
"""
if not text or not text.strip():
return True, ""
text_lower = text.lower()
    # Check for banned words (match at word starts so e.g. "skill" does not trigger "kill")
    for word in BANNED_WORDS:
        if re.search(r'\b' + re.escape(word), text_lower):
            return False, f"Input contains inappropriate word: '{word}'"
    # Check for unsafe keywords
    for keyword in UNSAFE_KEYWORDS:
        if re.search(r'\b' + re.escape(keyword), text_lower):
            return False, f"Input contains unsafe keyword: '{keyword}'"
# Check for excessive caps (yelling)
if len(text) > 10 and sum(1 for c in text if c.isupper()) / len(text) > 0.7:
return False, "Please don't use all caps"
return True, ""
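# Illustrative usage of check_input_safety (the exact rejection message depends on
# which banned/unsafe term matches first):
#     check_input_safety("A poem about my best friend")  # -> (True, "")
#     check_input_safety("a poem about weapons")          # -> (False, "Input contains inappropriate word: 'weapon'")
#     check_input_safety("WRITE ME A POEM NOW PLEASE")    # -> (False, "Please don't use all caps")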
def filter_output(text: str) -> str:
"""Filter and clean poem output from model."""
# Remove common unwanted patterns
text = re.sub(r'(?i)(here is|here\'s).*?(poem|verse).*?:', '', text)
text = re.sub(r'(?i)^(poem|title|verse).*?:', '', text, flags=re.MULTILINE)
text = re.sub(r'\*\*.*?\*\*', '', text) # Remove markdown bold
text = re.sub(r'#{1,6}\s+.*', '', text) # Remove markdown headers
text = re.sub(r'---+', '', text) # Remove separators
text = re.sub(r'##\s+Guidelines.*', '', text, flags=re.DOTALL) # Remove guidelines if leaked
text = re.sub(r'<.*?>', '', text) # Remove any remaining placeholders
# Split into lines and filter
lines = [line.strip() for line in text.split('\n') if line.strip()]
# Remove lines that look like instructions or metadata
poem_lines = []
for line in lines:
lower_line = line.lower()
# Skip instruction-like lines
if any(skip in lower_line for skip in ['guideline', 'parameter', 'instruction', 'format', 'length:', 'age:', 'theme:', 'interest:', 'description:', '- **', 'output only']):
continue
# Skip numbered instruction lines
if re.match(r'^\d+\.\s+\*\*', line):
continue
poem_lines.append(line)
    # Filter banned words line by line so the poem's line breaks are preserved
    filtered_lines = []
    for line in poem_lines:
        filtered_tokens = []
        for token in line.split():
            clean = token.lower().strip('.,!?')
            if clean in BANNED_WORDS:
                filtered_tokens.append("***")
            else:
                filtered_tokens.append(token)
        filtered_lines.append(" ".join(filtered_tokens))
    return "\n".join(filtered_lines)
def load_prompt_template(filepath: str) -> str:
"""Load prompt template from markdown file."""
with open(filepath, 'r', encoding='utf-8') as f:
return f.read()
def fill_prompt_template(template: str, **kwargs) -> str:
"""Replace placeholders like <age>, <theme> with actual values."""
result = template
for key, value in kwargs.items():
placeholder = f"<{key}>"
result = result.replace(placeholder, str(value))
return result
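# Illustrative example of fill_prompt_template with hypothetical placeholder names:
#     fill_prompt_template("Write a <length> poem about <theme>.", length="short", theme="autumn")
#     # -> "Write a short poem about autumn."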
def select_words_to_blank(poem: str, difficulty: str, age: int, model) -> List[str]:
"""Use AI model to intelligently select words to blank based on difficulty and age.
Args:
poem: The complete poem text
difficulty: Easy, Medium, or Hard
age: Reader's age for appropriate word selection
model: The PoetryModel instance
Returns:
List of words to blank out (lowercase)
"""
if difficulty == "Easy":
n_words = 3
instruction = "3 simple, common words that are easy to guess from context"
elif difficulty == "Medium":
n_words = 6
instruction = "6 moderately challenging words with some ambiguity"
else: # Hard
n_words = 9
instruction = "9 key thematic words and challenging vocabulary with high ambiguity"
prompt = f"""Select exactly {n_words} words from this poem to remove for a fill-in-the-blank exercise.
Age: {age} years old
Difficulty: {difficulty}
Poem:
{poem}
Instructions:
- Choose {instruction}
- For {difficulty} difficulty, select words appropriate for age {age}
- Consider context clues and ambiguity level
- Return ONLY the {n_words} words, one per line, nothing else
Selected words:"""
try:
response = model.generate(prompt, max_tokens=512, temperature=0.3)
print(f"\n{'='*60}\nAI WORD SELECTION\n{'='*60}")
print(f"Difficulty: {difficulty} | Age: {age}")
print(f"AI Response:\n{response}")
# Parse the response to extract words
selected = []
for line in response.strip().split('\n'):
word = line.strip().strip('.,!?-*•"\'1234567890. ').lower()
if word and len(word) > 1 and word not in selected:
selected.append(word)
if len(selected) >= n_words:
break
print(f"Parsed words: {selected}")
print(f"{'='*60}\n")
return selected[:n_words] if selected else []
except Exception as e:
print(f"Error selecting words with model: {e}")
return []
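# Illustrative usage of select_words_to_blank; `model` can be any object exposing
# generate(prompt, max_tokens=..., temperature=...), such as the project's PoetryModel.
# The returned words depend entirely on the model's response, e.g.:
#     select_words_to_blank(poem, "Medium", age=8, model=model)
#     # -> e.g. ["whisper", "meadow", "gentle", "dancing", "silver", "breeze"]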
def create_fill_in_blank(poem: str, difficulty: str, selected_words: Optional[List[str]] = None) -> Tuple[str, List[str], List[int], List[str]]:
"""Create fill-in-blank exercise from poem with exact positions.
Args:
poem: The complete poem text
difficulty: Easy (3 blanks), Medium (6 blanks), Hard (9 blanks)
selected_words: Optional list of specific words to blank (from AI model)
Returns:
Tuple of (blanked_poem, correct_answers_in_order, positions, all_poem_words)
"""
words = poem.split()
# Determine number of blanks based on difficulty
if difficulty == "Easy":
n_blanks = 3
elif difficulty == "Medium":
n_blanks = 6
else: # Hard
n_blanks = 9
    # If specific words were provided by the AI model, use those
    blank_indices = []
    if selected_words:
        selected_lower = {w.lower() for w in selected_words}
        for idx, word in enumerate(words):
            clean_word = word.strip('.,!?').lower()
            if clean_word in selected_lower:
                blank_indices.append(idx)
                if len(blank_indices) >= n_blanks:
                    break
# Fallback if AI didn't provide enough words
if len(blank_indices) < n_blanks:
candidates = [i for i, w in enumerate(words) if len(w.strip('.,!?')) > 4 and i not in blank_indices]
if len(candidates) < (n_blanks - len(blank_indices)):
candidates = [i for i, w in enumerate(words) if len(w.strip('.,!?')) > 3 and i not in blank_indices]
additional = random.sample(candidates, min(n_blanks - len(blank_indices), len(candidates)))
blank_indices.extend(additional)
if not blank_indices:
return poem, [], [], words
# Sort positions for consistent ordering
blank_indices = sorted(blank_indices[:n_blanks])
# Store correct answers in order of appearance
correct_answers = []
positions = []
blanked_words = words.copy()
for idx in blank_indices:
original_word = words[idx].strip('.,!?')
correct_answers.append(original_word)
positions.append(idx)
# Replace with uniform blank (15 underscores)
blanked_words[idx] = "_______________"
blanked_poem = " ".join(blanked_words)
return blanked_poem, correct_answers, positions, words
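# Illustrative example of create_fill_in_blank (note the blanked poem is rebuilt
# with spaces; line structure can be recovered from the returned word list and positions):
#     poem = "The silver moon is shining bright\nAbove the quiet sleeping town"
#     blanked, answers, positions, words = create_fill_in_blank(poem, "Easy", ["silver", "shining", "quiet"])
#     # blanked   -> "The _______________ moon is _______________ bright Above the _______________ sleeping town"
#     # answers   -> ["silver", "shining", "quiet"]
#     # positions -> [1, 4, 8]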
def get_word_definition(word: str) -> Optional[str]:
"""Get word definition from WordNet."""
synsets = wn.synsets(word)
if synsets:
return synsets[0].definition().lower()
return None
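# Illustrative example of get_word_definition (requires the WordNet corpus
# downloaded at import time above):
#     get_word_definition("ocean")
#     # -> e.g. "a large body of water constituting a principal part of the hydrosphere"
#     get_word_definition("notarealwordxyz")
#     # -> None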
def rank_definitions(definitions: List[str], correct_definition: str, use_ai: bool = False, model=None) -> List[Tuple[int, float]]:
"""Rank player definitions by similarity to correct definition.
Returns:
List of tuples (player_index, score) sorted by score descending
"""
def jaccard(a: str, b: str) -> float:
a_set = set(a.split())
b_set = set(b.split())
if not a_set or not b_set:
return 0.0
return len(a_set & b_set) / len(a_set | b_set)
scores = []
for idx, definition in enumerate(definitions):
if not definition or not definition.strip():
scores.append((idx, 0.0))
continue
# Calculate Jaccard similarity
jaccard_score = jaccard(definition.lower(), correct_definition.lower())
# If AI scoring is enabled and model is provided
if use_ai and model:
try:
prompt = f"""Rate how well this definition matches the correct definition on a scale of 0-10.
Correct definition: {correct_definition}
Player definition: {definition}
Respond with only a number between 0 and 10."""
                ai_response = model.generate(prompt, max_tokens=128, temperature=0.3)
                # Extract the first number from the response and clamp it to the 0-1 range
                matches = re.findall(r'\d+\.?\d*', ai_response)
                ai_score = min(float(matches[0]) / 10.0, 1.0) if matches else 0.0
                # Combine AI score (70%) and Jaccard score (30%)
                final_score = 0.7 * ai_score + 0.3 * jaccard_score
            except Exception:
                final_score = jaccard_score
else:
final_score = jaccard_score
scores.append((idx, final_score))
scores.sort(key=lambda x: x[1], reverse=True)
return scores
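# Illustrative example of rank_definitions with the default Jaccard-only scoring:
#     rank_definitions(["a big area of salt water", "a type of fish"],
#                      "a large body of salt water")
#     # -> [(0, 0.5), (1, 0.25)]  (best-matching player first)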
def load_vocabulary(filepath: str) -> dict:
"""Load vocabulary dictionary from markdown file."""
vocab = {"Easy": [], "Medium": [], "Hard": []}
current_level = None
with open(filepath, 'r', encoding='utf-8') as f:
for line in f:
line = line.strip()
if line.startswith("## Easy"):
current_level = "Easy"
elif line.startswith("## Medium"):
current_level = "Medium"
elif line.startswith("## Hard"):
current_level = "Hard"
elif line and current_level and line.startswith("-"):
word = line.lstrip("- ").strip()
vocab[current_level].append(word)
return vocab
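# Expected vocabulary file layout (hypothetical path "vocabulary.md"):
#     ## Easy
#     - cat
#     - sun
#     ## Medium
#     - whisper
#     ## Hard
#     - luminous
#
#     load_vocabulary("vocabulary.md")
#     # -> {"Easy": ["cat", "sun"], "Medium": ["whisper"], "Hard": ["luminous"]}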
def load_themes(filepath: str) -> List[str]:
"""Load themes from markdown file."""
themes = []
try:
with open(filepath, 'r', encoding='utf-8') as f:
for line in f:
line = line.strip()
if line and line.startswith("-"):
theme = line.lstrip("- ").strip()
if theme:
themes.append(theme)
except FileNotFoundError:
# Return default themes if file not found
themes = ["Nature", "Animals", "Friendship", "Adventure", "Family", "Seasons", "Ocean", "Space", "Dreams", "Magic"]
return themes
def load_interests(filepath: str) -> List[str]:
"""Load interests/hobbies from markdown file."""
interests = []
try:
with open(filepath, 'r', encoding='utf-8') as f:
for line in f:
line = line.strip()
if line and line.startswith("-"):
interest = line.lstrip("- ").strip()
if interest:
interests.append(interest)
except FileNotFoundError:
# Return default interests if file not found
interests = [
"Sports", "Music", "Art", "Reading", "Dancing", "Video Games",
"Dinosaurs", "Superheroes", "Princesses", "Science", "Cooking",
"Animals", "Cars", "Robots", "Movies", "Swimming"
]
return interests
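# Both loaders expect one "- item" per line (hypothetical paths shown):
#     load_themes("themes.md")       # -> ["Nature", "Animals", ...]
#     load_interests("missing.md")   # -> the default interests list above (file not found)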
def save_leaderboard_score(filepath: str, difficulty: str, player_name: str, score: float):
"""Save a player's score to the leaderboard file."""
import json
import os
# Load existing leaderboard or create new
if os.path.exists(filepath):
with open(filepath, 'r', encoding='utf-8') as f:
leaderboard = json.load(f)
else:
leaderboard = {"Easy": [], "Medium": [], "Hard": []}
# Add new score
if difficulty in leaderboard:
leaderboard[difficulty].append({"name": player_name, "score": score})
# Sort by score descending and keep top 100
leaderboard[difficulty].sort(key=lambda x: x["score"], reverse=True)
leaderboard[difficulty] = leaderboard[difficulty][:100]
# Save back to file
with open(filepath, 'w', encoding='utf-8') as f:
json.dump(leaderboard, f, indent=2)
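# Illustrative example of save_leaderboard_score ("leaderboard.json" is a hypothetical path):
#     save_leaderboard_score("leaderboard.json", "Easy", "Mia", 87.5)
#     # leaderboard.json now holds:
#     # {"Easy": [{"name": "Mia", "score": 87.5}], "Medium": [], "Hard": []}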
def get_leaderboard(filepath: str, difficulty: str, top_n: int = 10) -> List[Dict]:
"""Get top N players from leaderboard for a difficulty level."""
import json
import os
if not os.path.exists(filepath):
return []
try:
with open(filepath, 'r', encoding='utf-8') as f:
leaderboard = json.load(f)
if difficulty in leaderboard:
return leaderboard[difficulty][:top_n]
    except Exception:
        return []
return []
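# Illustrative example of get_leaderboard, continuing the leaderboard.json example above:
#     get_leaderboard("leaderboard.json", "Easy", top_n=3)
#     # -> [{"name": "Mia", "score": 87.5}]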