# audio_processor.py - FREE TTS and STT for English AND Urdu voice notes
import os
import tempfile
import logging
import time
import re
from typing import Optional, Dict, Any

from fastapi import HTTPException, UploadFile

logger = logging.getLogger(__name__)
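
# Optional runtime dependencies (each STT path is skipped gracefully if its package
# is missing):
#   openai-whisper     -> local Whisper transcription (offline)
#   SpeechRecognition  -> Google Web Speech API fallback (requires internet)
#   pydub              -> webm-to-wav conversion for the fallback path
# Both audio-decoding paths expect ffmpeg to be available on PATH.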

class AudioProcessor:
    """FREE Audio processing system for STT and TTS functionality (English + Urdu ONLY)"""

    def __init__(self):
        self.supported_languages = ["english", "urdu"]
        logger.info("🎵 FREE Audio Processor initialized - Supporting English & Urdu ONLY")

    async def speech_to_text(self, audio_file: UploadFile, language: str = "auto") -> Dict[str, Any]:
        """
        Convert speech to text using FREE STT services for English AND Urdu ONLY.

        Returns a dict with "text", "language", "service" and "confidence" keys.
        """
        try:
            logger.info(f"🎤 Converting speech to text - Language: {language}")

            # Read audio file
            audio_content = await audio_file.read()

            # Try local Whisper for multilingual support
            stt_result = await self._try_whisper_stt(audio_content, language)
            if stt_result:
                # Verify detected language is only Urdu or English
                detected_language = self._strict_detect_language_from_text(stt_result["text"])
                if detected_language not in ["english", "urdu"]:
                    logger.warning(f"⚠️ Detected non-supported language: {detected_language}, treating as English")
                    detected_language = "english"
                stt_result["language"] = detected_language
                return stt_result

            # Fallback to SpeechRecognition with Google Web API (mainly English)
            stt_result = await self._try_speech_recognition(audio_content)
            if stt_result:
                detected_language = self._strict_detect_language_from_text(stt_result["text"])
                if detected_language not in ["english", "urdu"]:
                    detected_language = "english"
                stt_result["language"] = detected_language
                return stt_result

            raise HTTPException(status_code=400, detail="No FREE STT service available")
        except HTTPException:
            # Re-raise as-is so the 400 above is not masked as a 500
            raise
        except Exception as e:
            logger.error(f"❌ STT Error: {e}")
            raise HTTPException(status_code=500, detail=f"Speech recognition failed: {str(e)}")
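
    # A successful speech_to_text() call yields a payload of this shape (values are
    # illustrative only):
    #   {"text": "I feel a bit anxious today", "language": "english",
    #    "service": "local_whisper", "confidence": 0.8}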

    async def _try_whisper_stt(self, audio_content: bytes, language: str = "auto") -> Optional[Dict[str, Any]]:
        """Try local Whisper model with strict language filtering"""
        try:
            import whisper

            # Create temporary file
            with tempfile.NamedTemporaryFile(suffix=".webm", delete=False) as temp_audio:
                temp_audio.write(audio_content)
                temp_audio_path = temp_audio.name

            try:
                logger.info("🔊 Using local Whisper (English/Urdu)...")
                model = whisper.load_model("base")

                # Set language parameter for Whisper - only allow English or Urdu
                whisper_language = None
                if language == "urdu":
                    whisper_language = "urdu"
                elif language == "english":
                    whisper_language = "english"
                # For "auto", let Whisper detect but we'll filter later

                result = model.transcribe(temp_audio_path, language=whisper_language)

                # Apply strict language detection
                detected_language = self._strict_detect_language_from_text(result["text"])

                return {
                    "text": result["text"].strip(),
                    "language": detected_language,
                    "service": "local_whisper",
                    "confidence": 0.8
                }
            finally:
                # Ensure temp file cleanup
                if os.path.exists(temp_audio_path):
                    try:
                        os.unlink(temp_audio_path)
                    except Exception as cleanup_error:
                        logger.warning(f"⚠️ Failed to cleanup temp file: {cleanup_error}")
        except ImportError:
            logger.warning("Whisper not available for local STT")
            return None
        except Exception as e:
            logger.warning(f"Local Whisper STT failed: {e}")
            return None

    async def _try_speech_recognition(self, audio_content: bytes) -> Optional[Dict[str, Any]]:
        """Try SpeechRecognition with Google Web API (mainly English)"""
        try:
            import speech_recognition as sr
            from pydub import AudioSegment
            import io

            # Convert webm to wav for SpeechRecognition
            audio = AudioSegment.from_file(io.BytesIO(audio_content), format="webm")
            wav_data = io.BytesIO()
            audio.export(wav_data, format="wav")
            wav_data.seek(0)

            recognizer = sr.Recognizer()
            with sr.AudioFile(wav_data) as source:
                audio_data = recognizer.record(source)

            text = recognizer.recognize_google(audio_data)

            # Apply strict language detection
            detected_language = self._strict_detect_language_from_text(text)

            return {
                "text": text,
                "language": detected_language,
                "service": "google_web_api",
                "confidence": 0.7
            }
        except ImportError:
            logger.warning("SpeechRecognition not available")
            return None
        except Exception as e:
            logger.warning(f"SpeechRecognition failed: {e}")
            return None

    def _strict_detect_language_from_text(self, text: str) -> str:
        """
        Strict language detection that only identifies Urdu or English.
        Specifically excludes Hindi, Arabic, and other languages.
        """
        try:
            text = text.strip()
            if not text:
                return "english"  # Default to English for empty text

            # === STRICT URDU DETECTION ===
            # Urdu-specific letters (excluding Arabic and Hindi overlaps)
            urdu_specific_chars = [
                '\u0679',  # Tteh
                '\u067E',  # Peh
                '\u0686',  # Tcheh
                '\u0688',  # Ddal
                '\u0691',  # Rreh
                '\u0698',  # Jeh
                '\u06A9',  # Keheh
                '\u06AF',  # Gaf
                '\u06BA',  # Noon Ghunna
                '\u06BE',  # Heh Doachashmee
                '\u06C1',  # Heh Goal
                '\u06C2',  # Heh Goal with Hamza Above
                '\u06CC',  # Farsi Yeh
                '\u06D2',  # Yeh Barree
            ]

            # Common Urdu words that are distinct from Hindi/Arabic
            urdu_specific_words = [
                'ہے', 'ہیں', 'ہوں', 'کیا', 'کے', 'کو', 'سے', 'پر', 'میں',
                'اور', 'لیکن', 'اگر', 'تو', 'بھی', 'ہی', 'تھا', 'تھی',
                'تھے', 'ہو', 'رہا', 'رہی', 'رہے', 'دیں', 'دی', 'دو', 'دیجیے',
                'برائے', 'کےلیے', 'کےساتھ', 'کےبعد', 'کےپاس', 'کےنیچے'
            ]

            # Count Urdu-specific characters
            urdu_char_count = sum(text.count(char) for char in urdu_specific_chars)

            # Check for Urdu-specific words
            urdu_word_count = sum(1 for word in urdu_specific_words if word in text)

            # Check for common Urdu sentence structures
            urdu_indicators = [
                ' کا ', ' کی ', ' کے ', ' کو ', ' سے ', ' پر ', ' میں ', ' نے ',
                ' ہی ', ' بھی ', ' تو ', ' اگر ', ' لیکن ', ' اور ', ' یا '
            ]
            urdu_structure_count = sum(1 for indicator in urdu_indicators if indicator in text)

            # === HINDI EXCLUSION ===
            # Hindi-specific characters and words to exclude
            hindi_specific_chars = r'[\u0900-\u097F]'  # Devanagari range
            hindi_char_count = len(re.findall(hindi_specific_chars, text))

            hindi_specific_words = ['है', 'हो', 'की', 'के', 'को', 'से', 'में', 'ना', 'नी', 'ने']
            hindi_word_count = sum(1 for word in hindi_specific_words if word in text)

            # === ARABIC EXCLUSION ===
            # Arabic-specific characters (excluding common Urdu-Arabic overlaps)
            arabic_specific_chars = r'[\uFE70-\uFEFF]'  # Arabic presentation forms
            arabic_char_count = len(re.findall(arabic_specific_chars, text))

            # === ENGLISH DETECTION ===
            english_words = [
                'the', 'and', 'you', 'that', 'was', 'for', 'are', 'with', 'his', 'they',
                'this', 'have', 'from', 'one', 'had', 'word', 'but', 'not', 'what', 'all',
                'were', 'when', 'your', 'can', 'said', 'there', 'each', 'which', 'she', 'do',
                'how', 'their', 'will', 'other', 'about', 'out', 'many', 'then', 'them', 'these'
            ]
            text_lower = text.lower()
            english_score = sum(1 for word in english_words if word in text_lower)

            # === LANGUAGE DECISION LOGIC ===
            # First, exclude Hindi and Arabic
            if hindi_char_count > 2 or hindi_word_count > 1:
                logger.info("🔍 Hindi detected, treating as English")
                return "english"

            if arabic_char_count > 2:
                logger.info("🔍 Arabic detected, treating as English")
                return "english"

            # Then detect Urdu with high confidence
            urdu_confidence_score = (
                urdu_char_count * 2 +
                urdu_word_count * 3 +
                urdu_structure_count * 1.5
            )
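
            # Rough illustration (assumed typical input): a short Urdu phrase such as
            # "میں ٹھیک ہوں" contributes several Urdu-specific letters (ٹ, ھ, ں, ہ, ی)
            # plus the word matches 'میں' and 'ہوں', so it scores well above the
            # threshold below, while plain English text scores 0 on these counters.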

            # Strong Urdu detection thresholds
            if urdu_confidence_score >= 5:
                logger.info(f"🔍 Urdu detected (confidence: {urdu_confidence_score})")
                return "urdu"

            # English detection
            if english_score >= 3 or len(text.split()) >= 4:
                logger.info(f"🔍 English detected (score: {english_score})")
                return "english"

            # If we have some Urdu indicators but not enough for confident detection
            if urdu_confidence_score >= 2:
                logger.info(f"🔍 Weak Urdu signals, treating as Urdu (confidence: {urdu_confidence_score})")
                return "urdu"

            # Default to English
            logger.info("🔍 Defaulting to English")
            return "english"
        except Exception as e:
            logger.error(f"❌ Language detection error: {e}")
            return "english"  # Safe default

    def _detect_language_from_text(self, text: str) -> str:
        """Legacy method for backward compatibility"""
        return self._strict_detect_language_from_text(text)

    async def text_to_speech(self, text: str, language: str = "english") -> Optional[Dict[str, Any]]:
        """
        Convert text to speech using FREE TTS services.
        NOTE: kept for potential future use, but currently disabled because responses are text-only.
        """
        try:
            logger.info(f"🔊 TTS requested for {language}: {text[:50]}...")
            return None  # TTS disabled for now
        except Exception as e:
            logger.error(f"❌ TTS Error: {e}")
            return None

    async def cleanup_old_audio_files(self, max_age_hours: int = 1):
        """Clean up audio files older than specified hours"""
        try:
            audio_dir = os.path.join("static", "audio")
            if not os.path.exists(audio_dir):
                return

            current_time = time.time()
            deleted_count = 0

            for filename in os.listdir(audio_dir):
                if filename.startswith("tts_") and (filename.endswith(".mp3") or filename.endswith(".wav")):
                    file_path = os.path.join(audio_dir, filename)
                    if os.path.isfile(file_path):
                        # Delete files older than max_age_hours
                        file_age_hours = (current_time - os.path.getctime(file_path)) / 3600
                        if file_age_hours > max_age_hours:
                            try:
                                os.remove(file_path)
                                deleted_count += 1
                                logger.info(f"🧹 Cleaned up old audio file: {filename}")
                            except Exception as cleanup_error:
                                logger.warning(f"⚠️ Failed to cleanup audio file {filename}: {cleanup_error}")

            if deleted_count > 0:
                logger.info(f"🧹 Cleaned up {deleted_count} old audio file(s)")
        except Exception as e:
            logger.error(f"Error cleaning up audio files: {e}")
# Global audio processor instance
audio_processor = AudioProcessor()
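
if __name__ == "__main__":
    # Quick manual check of the language heuristic (no audio or network needed).
    # The sample phrases are illustrative only; they are expected to trip the
    # English and Urdu branches respectively.
    samples = [
        "How are you feeling today?",
        "میں ٹھیک ہوں اور آپ؟",
    ]
    for sample in samples:
        detected = audio_processor._strict_detect_language_from_text(sample)
        print(f"{detected}: {sample}")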