import gradio as gr
import pandas as pd
import requests
from bs4 import BeautifulSoup
from functools import lru_cache
import csv
from io import StringIO
import re

# Configuration
LANGUAGE_CODES_FILE = "language-codes-full.csv"
APP_CONTENT_FILE = "app_content.md"
LANGUAGE_TAXONOMY_URL = "https://microsoft.github.io/linguisticdiversity/assets/lang2tax.txt"

# Language list will be loaded from CSV
# Structure: {primary_code: {"name": str, "alpha3_b": str, "alpha3_t": str, "alpha2": str}}
LANGUAGES = {}

# Language taxonomy mapping (from Joshi et al.'s linguistic diversity paper)
# Structure: {language_name_lowercase: level}
LANGUAGE_TAXONOMY = {}

# Taxonomy level descriptions
TAXONOMY_LEVELS = {
    0: "The Left-Behinds",
    1: "The Scraping-Bys",
    2: "The Hopefuls",
    3: "The Rising Stars",
    4: "The Underdogs",
    5: "The Winners",
}

# App content will be loaded from markdown file
APP_CONTENT = {
    "title": "Speech Resource Finder",
    "description": "Search for speech resources",
    "full_content": "",
}


def load_app_content(content_path=None):
    """Load app content from markdown file"""
    global APP_CONTENT
    if content_path is None:
        content_path = APP_CONTENT_FILE
    try:
        with open(content_path, 'r', encoding='utf-8') as f:
            content = f.read()

        # Parse markdown content
        lines = content.split('\n')

        # Extract title (first # heading)
        title = "Speech Resource Finder"
        for line in lines:
            if line.startswith('# '):
                title = line[2:].strip()
                break

        # Extract description (text after ## Description until next ##)
        description = ""
        in_description = False
        for line in lines:
            if line.startswith('## Description'):
                in_description = True
                continue
            elif in_description and line.startswith('##'):
                break
            elif in_description and line.strip():
                description += line.strip() + " "

        APP_CONTENT = {
            "title": title,
            "description": description.strip(),
            "full_content": content,
        }
        print(f"Loaded app content from {content_path}")
    except Exception as e:
        print(f"Error loading app content: {e}")
        print("Using default content")


def load_language_list(csv_path=None):
    """Load ISO 639 language codes from CSV file"""
    global LANGUAGES
    if csv_path is None:
        csv_path = LANGUAGE_CODES_FILE
    try:
        with open(csv_path, 'r', encoding='utf-8') as f:
            reader = csv.DictReader(f)
            for row in reader:
                # Use alpha3-b as primary key, fall back to alpha3-t if empty
                code_b = row['alpha3-b'].strip()
                code_t = row['alpha3-t'].strip()
                code_2 = row['alpha2'].strip()
                name = row['English'].strip()
                primary_code = code_b if code_b else code_t
                if primary_code and name:
                    LANGUAGES[primary_code] = {
                        "name": name,
                        "alpha3_b": code_b,
                        "alpha3_t": code_t,
                        "alpha2": code_2,
                    }
        print(f"Loaded {len(LANGUAGES)} languages from {csv_path}")
    except Exception as e:
        print(f"Error loading language list: {e}")
        # Fall back to a minimal set
        LANGUAGES = {
            "eng": {"name": "English", "alpha3_b": "eng", "alpha3_t": "", "alpha2": "en"},
            "spa": {"name": "Spanish", "alpha3_b": "spa", "alpha3_t": "", "alpha2": "es"},
            "fra": {"name": "French", "alpha3_b": "fra", "alpha3_t": "", "alpha2": "fr"},
            "deu": {"name": "German", "alpha3_b": "ger", "alpha3_t": "deu", "alpha2": "de"},
        }
        print(f"Using fallback with {len(LANGUAGES)} languages")


def load_language_taxonomy():
    """Load language taxonomy data from Microsoft's linguistic diversity project"""
    global LANGUAGE_TAXONOMY
    try:
        response = requests.get(LANGUAGE_TAXONOMY_URL, timeout=10)
        response.raise_for_status()
        # Parse the CSV-like content (format: language_name,level)
        for line in response.text.strip().split('\n'):
            if line.strip():
                parts = line.strip().split(',')
                if len(parts) == 2:
                    lang_name = parts[0].strip().lower()
                    level = int(parts[1].strip())
                    LANGUAGE_TAXONOMY[lang_name] = level
        print(f"Loaded taxonomy data for {len(LANGUAGE_TAXONOMY)} languages")
    except Exception as e:
        print(f"Warning: Could not load language taxonomy: {e}")
        print("Language classification will show as 'Unknown'")


def get_taxonomy_color(level):
    """Get color code for taxonomy level (red for left-behind, green for winners)"""
    colors = {
        0: "#d32f2f",  # Red - The Left-Behinds
        1: "#f57c00",  # Orange - The Scraping-Bys
        2: "#fbc02d",  # Yellow - The Hopefuls
        3: "#afb42b",  # Yellow-green - The Rising Stars
        4: "#7cb342",  # Light green - The Underdogs
        5: "#388e3c",  # Green - The Winners
    }
    return colors.get(level, "#757575")  # Gray for unknown


def get_language_taxonomy_info(language_name):
    """
    Get taxonomy classification for a language.
    Returns a tuple of (level, description) or (None, "Unknown")
    """
    if not language_name:
        return None, "Unknown"

    # Try exact match (case-insensitive)
    lang_lower = language_name.lower()
    if lang_lower in LANGUAGE_TAXONOMY:
        level = LANGUAGE_TAXONOMY[lang_lower]
        return level, TAXONOMY_LEVELS.get(level, f"Level {level}")

    # Try with semicolon-separated alternative names (e.g., "Catalan; Valencian")
    if ';' in lang_lower:
        parts = [p.strip() for p in lang_lower.split(';')]
        for part in parts:
            if part in LANGUAGE_TAXONOMY:
                level = LANGUAGE_TAXONOMY[part]
                return level, TAXONOMY_LEVELS.get(level, f"Level {level}")

    # Try with comma-separated variations (e.g., "Chinese, Mandarin")
    if ',' in lang_lower:
        parts = [p.strip() for p in lang_lower.split(',')]
        for part in parts:
            if part in LANGUAGE_TAXONOMY:
                level = LANGUAGE_TAXONOMY[part]
                return level, TAXONOMY_LEVELS.get(level, f"Level {level}")

    return None, "Unknown"


@lru_cache(maxsize=1)
def fetch_azure_asr_languages():
    """Scrape Azure Speech-to-Text supported languages"""
    url = "https://learn.microsoft.com/en-us/azure/ai-services/speech-service/language-support?tabs=stt"
    try:
        response = requests.get(url, timeout=10)
        response.raise_for_status()
        soup = BeautifulSoup(response.content, 'html.parser')

        # Find the table with locale data
        # The table has columns: Locale (BCP-47) | Language | Fast transcription support | Custom speech support
        tables = soup.find_all('table')
        azure_asr = {}
        for table in tables:
            rows = table.find_all('tr')
            if not rows:
                continue
            # Check if this is the right table by looking at headers
            headers = [th.get_text(strip=True) for th in rows[0].find_all('th')]
            if 'Locale' in ' '.join(headers) or 'Language' in ' '.join(headers):
                for row in rows[1:]:  # Skip header
                    cols = row.find_all('td')
                    if len(cols) >= 2:
                        locale = cols[0].get_text(strip=True)
                        language = cols[1].get_text(strip=True)
                        if locale and language:
                            azure_asr[locale] = language
                break
        return azure_asr
    except Exception as e:
        print(f"Error fetching Azure ASR data: {e}")
        return {}
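
# Illustrative shapes of the Azure scraper results (actual locales, names and voice
# counts depend on the live docs pages at fetch time; the TTS scraper is defined below):
#   fetch_azure_asr_languages() -> {"en-US": "English (United States)", ...}
#   fetch_azure_tts_languages() -> {"en-US": {"language": "English (United States)", "voice_count": 20}, ...}
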
@lru_cache(maxsize=1)
def fetch_azure_tts_languages():
    """Scrape Azure Text-to-Speech supported languages with voice counts"""
    url = "https://learn.microsoft.com/en-us/azure/ai-services/speech-service/language-support?tabs=tts"
    try:
        response = requests.get(url, timeout=10)
        response.raise_for_status()
        soup = BeautifulSoup(response.content, 'html.parser')

        # Find the TTS table
        # Columns: Locale (BCP-47) | Language | Text to speech voices
        tables = soup.find_all('table')
        azure_tts = {}
        for table in tables:
            rows = table.find_all('tr')
            if not rows:
                continue
            headers = [th.get_text(strip=True) for th in rows[0].find_all('th')]
            if 'Text to speech' in ' '.join(headers) or 'voices' in ' '.join(headers).lower():
                for row in rows[1:]:
                    cols = row.find_all('td')
                    if len(cols) >= 3:
                        locale = cols[0].get_text(strip=True)
                        language = cols[1].get_text(strip=True)
                        voices_text = cols[2].get_text(strip=True)
                        # Count number of voices (look for "Neural" in the text)
                        voice_count = voices_text.count('Neural')
                        if locale and language:
                            azure_tts[locale] = {
                                'language': language,
                                'voice_count': voice_count,
                            }
                break
        return azure_tts
    except Exception as e:
        print(f"Error fetching Azure TTS data: {e}")
        return {}


@lru_cache(maxsize=1)
def fetch_google_stt_languages():
    """Scrape Google Cloud Speech-to-Text supported languages"""
    url = "https://cloud.google.com/speech-to-text/docs/speech-to-text-supported-languages"
    try:
        response = requests.get(url, timeout=10)
        response.raise_for_status()
        soup = BeautifulSoup(response.content, 'html.parser')

        # Find tables with BCP-47 language codes
        tables = soup.find_all('table')
        google_stt = {}
        for table in tables:
            rows = table.find_all('tr')
            if not rows:
                continue
            # Check if this table has a BCP-47 column
            headers = [th.get_text(strip=True) for th in rows[0].find_all('th')]
            # Find BCP-47 column index
            bcp47_idx = None
            name_idx = None
            for idx, header in enumerate(headers):
                if 'BCP-47' in header or 'BCP47' in header:
                    bcp47_idx = idx
                if 'Name' in header and name_idx is None:
                    name_idx = idx
            if bcp47_idx is not None:
                for row in rows[1:]:  # Skip header
                    cols = row.find_all('td')
                    if len(cols) > bcp47_idx:
                        locale = cols[bcp47_idx].get_text(strip=True)
                        language = (
                            cols[name_idx].get_text(strip=True)
                            if name_idx is not None and len(cols) > name_idx
                            else ''
                        )
                        if locale and locale not in ['—', '-', '']:
                            google_stt[locale] = language
        return google_stt
    except Exception as e:
        print(f"Error fetching Google STT data: {e}")
        return {}


@lru_cache(maxsize=1)
def fetch_google_tts_languages():
    """Scrape Google Cloud Text-to-Speech supported languages with voice counts"""
    url = "https://cloud.google.com/text-to-speech/docs/voices"
    try:
        response = requests.get(url, timeout=10)
        response.raise_for_status()
        soup = BeautifulSoup(response.content, 'html.parser')

        # Find the voices table
        # Columns: Language | Voice type | Language code | Voice name | SSML Gender | Sample
        tables = soup.find_all('table')
        google_tts = {}
        for table in tables:
            rows = table.find_all('tr')
            if not rows:
                continue
            headers = [th.get_text(strip=True) for th in rows[0].find_all('th')]
            # Find Language code column index
            lang_code_idx = None
            for idx, header in enumerate(headers):
                if 'Language code' in header or 'language code' in header.lower():
                    lang_code_idx = idx
                    break
            if lang_code_idx is not None:
                for row in rows[1:]:
                    cols = row.find_all('td')
                    if len(cols) > lang_code_idx:
                        locale = cols[lang_code_idx].get_text(strip=True)
                        if locale and locale not in ['—', '-', '']:
                            # Count voices per locale
                            if locale in google_tts:
                                google_tts[locale]['voice_count'] += 1
                            else:
                                language = cols[0].get_text(strip=True) if len(cols) > 0 else ''
                                google_tts[locale] = {
                                    'language': language,
                                    'voice_count': 1,
                                }
        return google_tts
    except Exception as e:
        print(f"Error fetching Google TTS data: {e}")
        return {}


@lru_cache(maxsize=1)
def fetch_elevenlabs_multilingual_v2():
    """Get ElevenLabs Multilingual v2 supported languages"""
    # Based on https://elevenlabs.io/docs/models#multilingual-v2
    # These are ISO 639-1 (2-letter) codes
    supported_codes = {
        'en', 'ja', 'zh', 'de', 'hi', 'fr', 'ko', 'pt', 'it', 'es', 'id', 'nl',
        'tr', 'fil', 'pl', 'sv', 'bg', 'ro', 'ar', 'cs', 'el', 'fi', 'hr', 'ms',
        'sk', 'da', 'ta', 'uk', 'ru',
    }
    return supported_codes
@lru_cache(maxsize=1)
def fetch_elevenlabs_turbo_v3():
    """Get ElevenLabs Eleven Turbo v3 (formerly v3 Alpha) supported languages"""
    # Based on https://elevenlabs.io/docs/models#eleven-v3-alpha
    # These are ISO 639-3 (3-letter) codes
    supported_codes = {
        'afr', 'ara', 'hye', 'asm', 'aze', 'bel', 'ben', 'bos', 'bul', 'cat',
        'ceb', 'nya', 'hrv', 'ces', 'dan', 'nld', 'eng', 'est', 'fil', 'fin',
        'fra', 'glg', 'kat', 'deu', 'ell', 'guj', 'hau', 'heb', 'hin', 'hun',
        'isl', 'ind', 'gle', 'ita', 'jpn', 'jav', 'kan', 'kaz', 'kir', 'kor',
        'lav', 'lin', 'lit', 'ltz', 'mkd', 'msa', 'mal', 'cmn', 'mar', 'nep',
        'nor', 'pus', 'fas', 'pol', 'por', 'pan', 'ron', 'rus', 'srp', 'snd',
        'slk', 'slv', 'som', 'spa', 'swa', 'swe', 'tam', 'tel', 'tha', 'tur',
        'ukr', 'urd', 'vie', 'cym',
    }
    return supported_codes


@lru_cache(maxsize=1)
def fetch_aws_transcribe_languages():
    """Scrape AWS Transcribe (ASR) supported languages"""
    url = "https://docs.aws.amazon.com/transcribe/latest/dg/supported-languages.html"
    try:
        response = requests.get(url, timeout=10)
        response.raise_for_status()
        soup = BeautifulSoup(response.content, 'html.parser')

        # Find tables with language codes
        tables = soup.find_all('table')
        aws_transcribe = {}
        for table in tables:
            rows = table.find_all('tr')
            if not rows:
                continue
            # Check if this table has a language code column
            headers = [th.get_text(strip=True) for th in rows[0].find_all('th')]
            # Find language code column index
            lang_code_idx = None
            lang_name_idx = None
            for idx, header in enumerate(headers):
                if 'Language code' in header or 'language code' in header.lower():
                    lang_code_idx = idx
                if 'Language' == header or header.startswith('Language'):
                    lang_name_idx = idx
            if lang_code_idx is not None:
                for row in rows[1:]:  # Skip header
                    cols = row.find_all('td')
                    if len(cols) > lang_code_idx:
                        locale = cols[lang_code_idx].get_text(strip=True)
                        language = (
                            cols[lang_name_idx].get_text(strip=True)
                            if lang_name_idx is not None and len(cols) > lang_name_idx
                            else ''
                        )
                        if locale and locale not in ['—', '-', '']:
                            aws_transcribe[locale] = language
        return aws_transcribe
    except Exception as e:
        print(f"Error fetching AWS Transcribe data: {e}")
        return {}


@lru_cache(maxsize=1)
def fetch_aws_polly_languages():
    """Scrape AWS Polly (TTS) supported languages"""
    url = "https://docs.aws.amazon.com/polly/latest/dg/supported-languages.html"
    try:
        response = requests.get(url, timeout=10)
        response.raise_for_status()
        soup = BeautifulSoup(response.content, 'html.parser')

        # Find tables with language codes
        tables = soup.find_all('table')
        aws_polly = {}
        for table in tables:
            rows = table.find_all('tr')
            if not rows:
                continue
            # Check if this table has a language code column
            headers = [th.get_text(strip=True) for th in rows[0].find_all('th')]
            # Find language code column index
            lang_code_idx = None
            lang_name_idx = None
            for idx, header in enumerate(headers):
                if 'Language code' in header or 'language code' in header.lower():
                    lang_code_idx = idx
                if 'Language' == header or header.startswith('Language'):
                    lang_name_idx = idx
            if lang_code_idx is not None:
                for row in rows[1:]:  # Skip header
                    cols = row.find_all('td')
                    if len(cols) > lang_code_idx:
                        locale = cols[lang_code_idx].get_text(strip=True)
                        language = (
                            cols[lang_name_idx].get_text(strip=True)
                            if lang_name_idx is not None and len(cols) > lang_name_idx
                            else ''
                        )
                        if locale and locale not in ['—', '-', '']:
                            # Count voices per locale (each row is a different voice/locale combo)
                            if locale in aws_polly:
                                aws_polly[locale]['voice_count'] += 1
                            else:
                                aws_polly[locale] = {
                                    'language': language,
                                    'voice_count': 1,
                                }
        return aws_polly
    except Exception as e:
        print(f"Error fetching AWS Polly data: {e}")
        return {}
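
# Illustrative shapes of the AWS scraper results above (actual locales, names and
# voice counts depend on the live AWS docs pages at fetch time):
#   fetch_aws_transcribe_languages() -> {"en-US": "English, US", ...}
#   fetch_aws_polly_languages()      -> {"en-US": {"language": "English, US", "voice_count": 3}, ...}
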
def get_azure_locales_for_language(language_code):
    """
    Get Azure BCP-47 locales for a language using its alpha2 code
    Returns list of matching locales from Azure
    """
    lang_info = LANGUAGES.get(language_code)
    if not lang_info or not lang_info['alpha2']:
        return []

    alpha2 = lang_info['alpha2']
    azure_asr = fetch_azure_asr_languages()
    azure_tts = fetch_azure_tts_languages()

    # Find all locales that start with the alpha2 code
    matching_locales = set()
    for locale in azure_asr.keys():
        if locale.startswith(alpha2 + '-') or locale == alpha2:
            matching_locales.add(locale)
    for locale in azure_tts.keys():
        if locale.startswith(alpha2 + '-') or locale == alpha2:
            matching_locales.add(locale)
    return sorted(matching_locales)


def get_google_locales_for_language(language_code):
    """
    Get Google Cloud BCP-47 locales for a language using its alpha2 code
    Returns list of matching locales from Google Cloud
    """
    lang_info = LANGUAGES.get(language_code)
    if not lang_info or not lang_info['alpha2']:
        return []

    alpha2 = lang_info['alpha2']
    google_stt = fetch_google_stt_languages()
    google_tts = fetch_google_tts_languages()

    # Find all locales that start with the alpha2 code
    matching_locales = set()
    for locale in google_stt.keys():
        if locale.startswith(alpha2 + '-') or locale == alpha2:
            matching_locales.add(locale)
    for locale in google_tts.keys():
        if locale.startswith(alpha2 + '-') or locale == alpha2:
            matching_locales.add(locale)
    return sorted(matching_locales)


def check_elevenlabs_multilingual_v2_support(language_code):
    """
    Check if ElevenLabs Multilingual v2 supports a language using ISO 639-1 (alpha2) codes
    Returns True if supported, False otherwise
    """
    lang_info = LANGUAGES.get(language_code)
    if not lang_info:
        return False

    supported_codes = fetch_elevenlabs_multilingual_v2()
    # Check alpha2 code (2-letter code)
    if lang_info['alpha2'] and lang_info['alpha2'] in supported_codes:
        return True
    return False


def check_elevenlabs_turbo_v3_support(language_code):
    """
    Check if ElevenLabs Turbo v3 supports a language using ISO 639-3 (alpha3) codes
    Returns True if supported, False otherwise
    """
    lang_info = LANGUAGES.get(language_code)
    if not lang_info:
        return False

    supported_codes = fetch_elevenlabs_turbo_v3()
    # Check alpha3_b code first (3-letter code, bibliographic)
    if lang_info['alpha3_b'] and lang_info['alpha3_b'] in supported_codes:
        return True
    # Check alpha3_t code (3-letter code, terminological)
    if lang_info['alpha3_t'] and lang_info['alpha3_t'] in supported_codes:
        return True
    return False


def get_aws_locales_for_language(language_code):
    """
    Get AWS locales for a language using its alpha2 code
    Returns list of matching locales from AWS Transcribe and Polly
    """
    lang_info = LANGUAGES.get(language_code)
    if not lang_info or not lang_info['alpha2']:
        return []

    alpha2 = lang_info['alpha2']
    aws_transcribe = fetch_aws_transcribe_languages()
    aws_polly = fetch_aws_polly_languages()

    # Find all locales that start with the alpha2 code
    matching_locales = set()
    for locale in aws_transcribe.keys():
        if locale.startswith(alpha2 + '-') or locale == alpha2:
            matching_locales.add(locale)
    for locale in aws_polly.keys():
        if locale.startswith(alpha2 + '-') or locale == alpha2:
            matching_locales.add(locale)
    return sorted(matching_locales)
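
# Illustrative example of the alpha2 prefix matching used by the helpers above:
# for "fra" (alpha2 "fr"), locales such as "fr-FR" and "fr-CA" match because they
# start with "fr-" (or equal "fr"), so a call might return e.g.
#   get_azure_locales_for_language("fra") -> ["fr-BE", "fr-CA", "fr-CH", "fr-FR"]
# depending on what the scrapers found.
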
def search_huggingface_models(language_code, pipeline_tag, max_results=100, max_pages=3):
    """
    Search HuggingFace for models supporting a specific language
    pipeline_tag: 'automatic-speech-recognition' or 'text-to-speech'
    max_results: maximum number of models to return
    max_pages: maximum number of pages to search per language code
    Returns tuple: (list of model dictionaries, log messages)
    """
    lang_info = LANGUAGES.get(language_code)
    logs = []
    if not lang_info:
        logs.append(f"No language info found for code: {language_code}")
        return [], logs

    # Try multiple language code formats
    codes_to_try = []
    if lang_info['alpha2']:
        codes_to_try.append(lang_info['alpha2'])  # 2-letter code
    if lang_info['alpha3_b']:
        codes_to_try.append(lang_info['alpha3_b'])  # 3-letter code
    if lang_info['alpha3_t']:
        codes_to_try.append(lang_info['alpha3_t'])  # 3-letter terminological
    logs.append(f"Language codes to search: {set(codes_to_try)}")

    models = []
    seen_models = set()
    for code in codes_to_try:
        if len(models) >= max_results:
            break
        logs.append(f"Searching for language code: {code}")
        # Try multiple pages for this language code
        for page in range(max_pages):
            if len(models) >= max_results:
                break
            try:
                # Use HuggingFace model search with pagination
                url = f"https://huggingface.co/models?pipeline_tag={pipeline_tag}&language={code}&sort=trending"
                if page > 0:
                    url += f"&p={page}"
                logs.append(f" Page {page}: {url}")
                headers = {
                    'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36'
                }
                response = requests.get(url, headers=headers, timeout=10)
                response.raise_for_status()
                soup = BeautifulSoup(response.content, 'html.parser')

                # Parse model cards from the page
                model_cards = soup.find_all('article', class_='overview-card-wrapper')
                if not model_cards:
                    logs.append(f" No model cards found on page {page}")
                    break
                logs.append(f" Found {len(model_cards)} model cards on page {page}")

                for card in model_cards:
                    if len(models) >= max_results:
                        break
                    try:
                        link = card.find('a', href=True)
                        if link:
                            href = link.get('href', '')
                            model_name = href.lstrip('/')
                            if model_name and model_name != '#' and model_name not in seen_models:
                                seen_models.add(model_name)

                                # Parse stats directly from the card HTML by looking at SVG icons
                                downloads = 0
                                likes = 0
                                size = ""

                                # Find all SVG elements in the card
                                svgs = card.find_all('svg')
                                for svg in svgs:
                                    # Get the next sibling text after the SVG
                                    # Could be direct text or text within a span/other element
                                    next_elem = svg.find_next_sibling(string=True)
                                    stat_text = ""
                                    if next_elem and next_elem.strip():
                                        stat_text = next_elem.strip()
                                    else:
                                        # Try to find text in the next sibling element (e.g., a <span>)
                                        next_tag = svg.find_next_sibling()
                                        if next_tag:
                                            stat_text = next_tag.get_text(strip=True)
                                    if not stat_text or len(stat_text) < 1:
                                        continue

                                    # Identify icon type by viewBox or path content
                                    svg_str = str(svg)
                                    # Download icon: viewBox="0 0 32 32" with download arrow path
                                    if 'M26 24v4H6v-4H4v4a2 2 0 0 0 2 2h20a2 2 0 0 0 2-2v-4zm0-10l-1.41-1.41L17 20.17V2h-2v18.17l-7.59-7.58L6 14l10 10l10-10z' in svg_str:
                                        downloads = parse_stat_number(stat_text)
                                    # Like/heart icon: heart path
                                    elif 'M22.45,6a5.47,5.47,0,0,1,3.91,1.64,5.7,5.7,0,0,1,0,8L16,26.13' in svg_str:
                                        likes = parse_stat_number(stat_text)
                                    # Model size icon: small grid icon (viewBox="0 0 12 12") with specific path for parameter count
                                    elif 'M10 10H8.4V8.4H10V10Zm0-3.2H8.4V5.2H10v1.6ZM6.8 10H5.2V8.4h1.6V10Z' in svg_str:
                                        # Model parameter count (e.g., "2B", "0.6B")
                                        # Must be short and contain B for billion params
                                        if len(stat_text) <= 6 and re.search(r'\d+\.?\d*\s*[Bb]', stat_text):
                                            size = stat_text

                                models.append({
                                    'name': model_name,
                                    'url': f"https://huggingface.co/{model_name}",
                                    'downloads': downloads,
                                    'likes': likes,
                                    'size': size,
                                })
                    except Exception as e:
                        logs.append(f" Error parsing model card: {e}")
                        continue
            except Exception as e:
                logs.append(f" ERROR searching page {page}: {e}")
                break

    # Sort by downloads (descending)
    models.sort(key=lambda x: x['downloads'], reverse=True)
    logs.append(f"Total unique models found: {len(models)}")
    return models, logs
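
# Each entry returned by search_huggingface_models() has this shape (stats illustrative,
# scraped from the model card at search time):
#   {"name": "openai/whisper-large-v3", "url": "https://huggingface.co/openai/whisper-large-v3",
#    "downloads": 4070000, "likes": 2500, "size": ""}
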
def get_huggingface_stats(item_name, item_type='datasets'):
    """
    Get likes and downloads for a HuggingFace dataset or model using the API
    item_type: 'datasets' or 'models'
    Returns dict with likes and downloads
    NOTE: This method is currently NOT USED. We parse stats directly from HTML instead.
    Keeping it here as a fallback in case HTML parsing fails.
    """
    try:
        api_url = f"https://huggingface.co/api/{item_type}/{item_name}"
        response = requests.get(api_url, timeout=5)
        if response.status_code == 200:
            data = response.json()
            return {
                'likes': data.get('likes', 0),
                'downloads': data.get('downloads', 0),
            }
    except Exception:
        pass
    return {'likes': 0, 'downloads': 0}


def parse_stat_number(stat_text):
    """
    Parse HuggingFace stat numbers like '4.07M', '23.4k', '349' into integers
    Returns integer value or 0 if parsing fails
    """
    if not stat_text:
        return 0
    stat_text = stat_text.strip().upper()
    try:
        # Handle 'M' (millions)
        if 'M' in stat_text:
            return int(float(stat_text.replace('M', '')) * 1_000_000)
        # Handle 'K' (thousands)
        elif 'K' in stat_text:
            return int(float(stat_text.replace('K', '')) * 1_000)
        # Plain number
        else:
            return int(stat_text.replace(',', ''))
    except (ValueError, AttributeError):
        return 0


def deduplicate_models(models):
    """
    Deduplicate models by base name (without user/org prefix)
    Keep the model with most downloads and count duplicates
    Returns list of deduplicated models with duplicate count added
    """
    from collections import defaultdict

    # Group models by base name
    grouped = defaultdict(list)
    for model in models:
        # Extract base name (everything after the last '/')
        name_parts = model['name'].split('/')
        if len(name_parts) > 1:
            base_name = name_parts[-1]  # e.g., "whisper-large-v3"
        else:
            base_name = model['name']
        grouped[base_name].append(model)

    # For each group, keep the one with most downloads
    deduplicated = []
    for base_name, model_list in grouped.items():
        # Sort by downloads (descending) and keep the first one
        model_list.sort(key=lambda x: x['downloads'], reverse=True)
        best_model = model_list[0]
        # Record how many other copies of this base name were dropped
        best_model['duplicates'] = len(model_list) - 1
        deduplicated.append(best_model)

    # Sort by downloads again
    deduplicated.sort(key=lambda x: x['downloads'], reverse=True)
    return deduplicated
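
# Illustrative behaviour of the helpers above:
#   parse_stat_number("4.07M") -> 4070000, parse_stat_number("23.4k") -> 23400,
#   parse_stat_number("n/a") -> 0
#   deduplicate_models() keeps one entry per base name, e.g. "org-a/whisper-small" and
#   "org-b/whisper-small" collapse to whichever has more downloads, with
#   entry["duplicates"] recording how many other copies were dropped.
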
def search_huggingface_datasets(language_code, task_category, max_results=100, max_pages=3):
    """
    Search HuggingFace for datasets supporting a specific language
    task_category: 'automatic-speech-recognition' or 'text-to-speech'
    max_results: maximum number of datasets to return
    max_pages: maximum number of pages to search per language code
    Returns tuple: (list of dataset dictionaries, log messages)
    """
    lang_info = LANGUAGES.get(language_code)
    logs = []
    if not lang_info:
        logs.append(f"No language info found for code: {language_code}")
        return [], logs

    # Collect all unique language codes for this language
    language_codes = set()
    if lang_info['alpha2']:
        language_codes.add(lang_info['alpha2'])  # 2-letter code
    if lang_info['alpha3_b']:
        language_codes.add(lang_info['alpha3_b'])  # 3-letter code
    if lang_info['alpha3_t']:
        language_codes.add(lang_info['alpha3_t'])  # 3-letter terminological
    logs.append(f"Language codes to search: {language_codes}")

    datasets = []
    seen_datasets = set()
    # Search separately for each language code
    for code in language_codes:
        if len(datasets) >= max_results:
            break
        logs.append(f"Searching for language code: {code}")
        for page in range(max_pages):
            if len(datasets) >= max_results:
                break
            try:
                # Use HuggingFace dataset search with the correct format
                # Format: task_categories=task_categories:automatic-speech-recognition&language=language:en
                url = f"https://huggingface.co/datasets?task_categories=task_categories:{task_category}&language=language:{code}&sort=trending"
                if page > 0:
                    url += f"&p={page}"
                logs.append(f" Page {page}: {url}")
                headers = {
                    'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36'
                }
                response = requests.get(url, headers=headers, timeout=10)
                response.raise_for_status()
                soup = BeautifulSoup(response.content, 'html.parser')

                # Parse dataset cards from the page
                dataset_cards = soup.find_all('article', class_='overview-card-wrapper')
                if not dataset_cards:
                    logs.append(f" No dataset cards found on page {page}")
                    break
                logs.append(f" Found {len(dataset_cards)} dataset cards on page {page}")

                for card in dataset_cards:
                    if len(datasets) >= max_results:
                        break
                    try:
                        link = card.find('a', href=True)
                        if link:
                            href = link.get('href', '')
                            dataset_path = href.lstrip('/')
                            # Remove "datasets/" prefix if present
                            if dataset_path.startswith('datasets/'):
                                dataset_name = dataset_path[9:]  # Remove "datasets/" (9 chars)
                            else:
                                dataset_name = dataset_path
                            if dataset_name and dataset_name != '#' and dataset_name not in seen_datasets:
                                seen_datasets.add(dataset_name)

                                # Parse stats directly from the card HTML by looking at SVG icons
                                downloads = 0
                                likes = 0
                                size = ""

                                # Find all SVG elements in the card
                                svgs = card.find_all('svg')
                                for svg in svgs:
                                    # Get the next sibling text after the SVG
                                    # Could be direct text or text within a span/other element
                                    next_elem = svg.find_next_sibling(string=True)
                                    stat_text = ""
                                    if next_elem and next_elem.strip():
                                        stat_text = next_elem.strip()
                                    else:
                                        # Try to find text in the next sibling element (e.g., a <span>)
                                        next_tag = svg.find_next_sibling()
                                        if next_tag:
                                            stat_text = next_tag.get_text(strip=True)
                                    # Skip non-numeric text like "Viewer", "Updated", etc.
                                    if not stat_text or len(stat_text) < 1 or stat_text in ['Viewer', 'Updated']:
                                        continue

                                    # Identify icon type by viewBox or path content
                                    svg_str = str(svg)
                                    # Download icon: viewBox="0 0 32 32" with download arrow path
                                    if 'M26 24v4H6v-4H4v4a2 2 0 0 0 2 2h20a2 2 0 0 0 2-2v-4zm0-10l-1.41-1.41L17 20.17V2h-2v18.17l-7.59-7.58L6 14l10 10l10-10z' in svg_str:
                                        downloads = parse_stat_number(stat_text)
                                    # Like/heart icon: heart path
                                    elif 'M22.45,6a5.47,5.47,0,0,1,3.91,1.64,5.7,5.7,0,0,1,0,8L16,26.13' in svg_str:
                                        likes = parse_stat_number(stat_text)
                                    # Dataset size icon: table/grid icon with fill-rule="evenodd"
                                    elif 'fill-rule="evenodd"' in svg_str and 'clip-rule="evenodd"' in svg_str:
                                        # Dataset size (e.g., "411k", "23.4M", "65.1k")
                                        # Must look like a number (has k, M, or digits)
                                        if any(c in stat_text for c in ['k', 'K', 'm', 'M']) or stat_text.replace(',', '').replace('.', '').isdigit():
                                            size = stat_text

                                datasets.append({
                                    'name': dataset_name,
                                    'url': f"https://huggingface.co/datasets/{dataset_name}",
                                    'downloads': downloads,
                                    'likes': likes,
                                    'size': size,
                                })
                    except Exception as e:
                        logs.append(f" Error parsing dataset card: {e}")
                        continue
            except Exception as e:
                logs.append(f" ERROR searching page {page}: {e}")
                break

    # Sort by downloads (descending)
    datasets.sort(key=lambda x: x['downloads'], reverse=True)
    logs.append(f"Total unique datasets found: {len(datasets)}")
    return datasets, logs
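
# Each entry returned by search_huggingface_datasets() has this shape (stats illustrative,
# scraped from the dataset card at search time):
#   {"name": "mozilla-foundation/common_voice_17_0",
#    "url": "https://huggingface.co/datasets/mozilla-foundation/common_voice_17_0",
#    "downloads": 120000, "likes": 300, "size": "411k"}
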
def search_language_resources(language_code, deduplicate=False):
    """
    Search for ASR/TTS resources for a given language
    Returns results organized by service type
    deduplicate: if True, remove duplicate models (same base name) and keep only the one with most downloads
    """
    all_logs = []
    if not language_code:
        return None, None, None, 0, 0, None, None, 0, 0, ""

    lang_info = LANGUAGES.get(language_code)
    if not lang_info:
        return None, None, None, 0, 0, None, None, 0, 0, ""

    language_name = lang_info['name']
    all_logs.append(f"=== Searching for {language_name} ({language_code}) ===")
    all_logs.append(f"Language codes: alpha2={lang_info['alpha2']}, alpha3_b={lang_info['alpha3_b']}, alpha3_t={lang_info['alpha3_t']}")

    # Fetch Azure data
    all_logs.append("\n[Azure Speech Services]")
    azure_asr = fetch_azure_asr_languages()
    azure_tts = fetch_azure_tts_languages()
    all_logs.append(f" Fetched {len(azure_asr)} ASR languages and {len(azure_tts)} TTS languages from Azure")

    # Get matching Azure locales using alpha2 code
    azure_locales = get_azure_locales_for_language(language_code)
    all_logs.append(f" Matching Azure locales: {azure_locales}")

    # Check Azure ASR support
    azure_asr_locales = [loc for loc in azure_locales if loc in azure_asr]
    azure_asr_available = len(azure_asr_locales) > 0
    all_logs.append(f" Azure ASR: {'✅ Supported' if azure_asr_available else '❌ Not supported'} ({len(azure_asr_locales)} locales)")

    # Check Azure TTS support and count voices
    azure_tts_locales = [loc for loc in azure_locales if loc in azure_tts]
    azure_tts_available = len(azure_tts_locales) > 0
    azure_total_voices = sum(azure_tts[loc]['voice_count'] for loc in azure_tts_locales)
    all_logs.append(f" Azure TTS: {'✅ Supported' if azure_tts_available else '❌ Not supported'} ({len(azure_tts_locales)} locales, {azure_total_voices} voices)")

    # Fetch Google Cloud data
    all_logs.append("\n[Google Cloud Speech]")
    google_stt = fetch_google_stt_languages()
    google_tts = fetch_google_tts_languages()
    all_logs.append(f" Fetched {len(google_stt)} STT languages and {len(google_tts)} TTS languages from Google Cloud")

    # Get matching Google Cloud locales using alpha2 code
    google_locales = get_google_locales_for_language(language_code)
    all_logs.append(f" Matching Google Cloud locales: {google_locales}")

    # Check Google Cloud STT support
    google_stt_locales = [loc for loc in google_locales if loc in google_stt]
    google_stt_available = len(google_stt_locales) > 0
    all_logs.append(f" Google STT: {'✅ Supported' if google_stt_available else '❌ Not supported'} ({len(google_stt_locales)} locales)")

    # Check Google Cloud TTS support and count voices
    google_tts_locales = [loc for loc in google_locales if loc in google_tts]
    google_tts_available = len(google_tts_locales) > 0
    google_total_voices = sum(google_tts[loc]['voice_count'] for loc in google_tts_locales)
    all_logs.append(f" Google TTS: {'✅ Supported' if google_tts_available else '❌ Not supported'} ({len(google_tts_locales)} locales, {google_total_voices} voices)")

    # Fetch AWS data
    all_logs.append("\n[AWS (Transcribe + Polly)]")
    aws_transcribe = fetch_aws_transcribe_languages()
    aws_polly = fetch_aws_polly_languages()
    all_logs.append(f" Fetched {len(aws_transcribe)} Transcribe languages and {len(aws_polly)} Polly languages from AWS")

    # Get matching AWS locales using alpha2 code
    aws_locales = get_aws_locales_for_language(language_code)
    all_logs.append(f" Matching AWS locales: {aws_locales}")

    # Check AWS Transcribe support
    aws_transcribe_locales = [loc for loc in aws_locales if loc in aws_transcribe]
    aws_transcribe_available = len(aws_transcribe_locales) > 0
    all_logs.append(f" AWS Transcribe: {'✅ Supported' if aws_transcribe_available else '❌ Not supported'} ({len(aws_transcribe_locales)} locales)")

    # Check AWS Polly support and count voices
    aws_polly_locales = [loc for loc in aws_locales if loc in aws_polly]
    aws_polly_available = len(aws_polly_locales) > 0
    aws_total_voices = sum(aws_polly[loc]['voice_count'] for loc in aws_polly_locales)
    all_logs.append(f" AWS Polly: {'✅ Supported' if aws_polly_available else '❌ Not supported'} ({len(aws_polly_locales)} locales, {aws_total_voices} voices)")

    # Commercial Services
    commercial_rows = []

    # Azure Speech
    if azure_asr_available:
        azure_asr_text = f"✅ {len(azure_asr_locales)} locale(s)"
    else:
        azure_asr_text = "❌ N/A"
    if azure_tts_available:
        azure_tts_text = f"✅ {len(azure_tts_locales)} locale(s), {azure_total_voices} voice(s)"
    else:
        azure_tts_text = "❌ N/A"
    commercial_rows.append({
        "Service": "Azure Speech",
        "ASR": azure_asr_text,
        "TTS": azure_tts_text,
    })

    # Google Cloud Speech
    if google_stt_available:
        google_stt_text = f"✅ {len(google_stt_locales)} locale(s)"
    else:
        google_stt_text = "❌ N/A"
    if google_tts_available:
        google_tts_text = f"✅ {len(google_tts_locales)} locale(s), {google_total_voices} voice(s)"
    else:
        google_tts_text = "❌ N/A"
    commercial_rows.append({
        "Service": "Google Cloud Speech",
        "ASR": google_stt_text,
        "TTS": google_tts_text,
    })

    # AWS (Transcribe + Polly)
    if aws_transcribe_available:
        aws_transcribe_text = f"✅ {len(aws_transcribe_locales)} locale(s)"
    else:
        aws_transcribe_text = "❌ N/A"
    if aws_polly_available:
        aws_polly_text = f"✅ {len(aws_polly_locales)} locale(s), {aws_total_voices} voice(s)"
    else:
        aws_polly_text = "❌ N/A"
    commercial_rows.append({
        "Service": "AWS (Transcribe + Polly)",
        "ASR": aws_transcribe_text,
        "TTS": aws_polly_text,
    })

    # ElevenLabs Multilingual v2 (TTS only)
    all_logs.append("\n[ElevenLabs]")
    elevenlabs_v2_supported = check_elevenlabs_multilingual_v2_support(language_code)
    all_logs.append(f" Multilingual v2: {'✅ Supported' if elevenlabs_v2_supported else '❌ Not supported'}")
    if elevenlabs_v2_supported:
        elevenlabs_v2_tts_text = "✅ Supported"
    else:
        elevenlabs_v2_tts_text = "❌ N/A"
    commercial_rows.append({
        "Service": "ElevenLabs Multilingual v2",
        "ASR": "N/A",  # ElevenLabs doesn't offer ASR
        "TTS": elevenlabs_v2_tts_text,
    })

    # ElevenLabs Turbo v3 (TTS only)
    elevenlabs_v3_supported = check_elevenlabs_turbo_v3_support(language_code)
    all_logs.append(f" Turbo v3: {'✅ Supported' if elevenlabs_v3_supported else '❌ Not supported'}")
    if elevenlabs_v3_supported:
        elevenlabs_v3_tts_text = "✅ Supported"
    else:
        elevenlabs_v3_tts_text = "❌ N/A"
    commercial_rows.append({
        "Service": "ElevenLabs Turbo v3",
        "ASR": "N/A",  # ElevenLabs doesn't offer ASR
        "TTS": elevenlabs_v3_tts_text,
    })

    commercial_df = pd.DataFrame(commercial_rows)

    # HuggingFace Models - Search for real ASR and TTS models
    all_logs.append("\n[HuggingFace Models]")
    asr_models, asr_model_logs = search_huggingface_models(language_code, 'automatic-speech-recognition', max_results=100, max_pages=5)
    all_logs.extend([f" [ASR] {log}" for log in asr_model_logs])
    tts_models, tts_model_logs = search_huggingface_models(language_code, 'text-to-speech', max_results=100, max_pages=5)
    all_logs.extend([f" [TTS] {log}" for log in tts_model_logs])

    # Apply deduplication if requested
    if deduplicate:
        all_logs.append(f"\n[Deduplication]")
        asr_before = len(asr_models)
        asr_models = deduplicate_models(asr_models)
        all_logs.append(f" ASR models: {asr_before} → {len(asr_models)} (removed {asr_before - len(asr_models)} duplicates)")
        tts_before = len(tts_models)
        tts_models = deduplicate_models(tts_models)
        all_logs.append(f" TTS models: {tts_before} → {len(tts_models)} (removed {tts_before - len(tts_models)} duplicates)")
    else:
        # Add duplicates count of 1 for all models when not deduplicating
        for model in asr_models:
            model['duplicates'] = 1
        for model in tts_models:
            model['duplicates'] = 1

    # Format ASR models with clickable names
    asr_models_data = []
    for model in asr_models:
        asr_models_data.append({
            "Model Name": f"[{model['name']}]({model['url']})",
            "Downloads": model['downloads'],
            "Likes": model['likes'],
            "Size": model.get('size', ''),
            "Duplicates": model.get('duplicates', 1),
        })
    if asr_models_data:
        asr_models_df = pd.DataFrame(asr_models_data)
    else:
        # Empty dataframe if no models found
        asr_models_df = pd.DataFrame(columns=["Model Name", "Downloads", "Likes", "Size", "Duplicates"])

    # Format TTS models with clickable names
    tts_models_data = []
    for model in tts_models:
        tts_models_data.append({
            "Model Name": f"[{model['name']}]({model['url']})",
            "Downloads": model['downloads'],
            "Likes": model['likes'],
            "Size": model.get('size', ''),
            "Duplicates": model.get('duplicates', 1),
        })
    if tts_models_data:
        tts_models_df = pd.DataFrame(tts_models_data)
    else:
        # Empty dataframe if no models found
        tts_models_df = pd.DataFrame(columns=["Model Name", "Downloads", "Likes", "Size", "Duplicates"])

    # HuggingFace Datasets - Search for real ASR and TTS datasets
    all_logs.append("\n[HuggingFace Datasets]")
    asr_datasets, asr_dataset_logs = search_huggingface_datasets(language_code, 'automatic-speech-recognition', max_results=100, max_pages=5)
    all_logs.extend([f" [ASR] {log}" for log in asr_dataset_logs])
    tts_datasets, tts_dataset_logs = search_huggingface_datasets(language_code, 'text-to-speech', max_results=100, max_pages=5)
    all_logs.extend([f" [TTS] {log}" for log in tts_dataset_logs])

    # Format ASR datasets with clickable names
    asr_datasets_data = []
    for dataset in asr_datasets:
        asr_datasets_data.append({
            "Dataset Name": f"[{dataset['name']}]({dataset['url']})",
            "Downloads": dataset['downloads'],
            "Likes": dataset['likes'],
            "Size": dataset.get('size', ''),
        })
    if asr_datasets_data:
        asr_datasets_df = pd.DataFrame(asr_datasets_data)
    else:
        # Empty dataframe if no datasets found
        asr_datasets_df = pd.DataFrame(columns=["Dataset Name", "Downloads", "Likes", "Size"])

    # Format TTS datasets with clickable names
    tts_datasets_data = []
    for dataset in tts_datasets:
        tts_datasets_data.append({
            "Dataset Name": f"[{dataset['name']}]({dataset['url']})",
            "Downloads": dataset['downloads'],
            "Likes": dataset['likes'],
            "Size": dataset.get('size', ''),
        })
    if tts_datasets_data:
        tts_datasets_df = pd.DataFrame(tts_datasets_data)
    else:
        # Empty dataframe if no datasets found
        tts_datasets_df = pd.DataFrame(columns=["Dataset Name", "Downloads", "Likes", "Size"])

    # Combine all logs
    log_text = "\n".join(all_logs)

    # Return separate ASR and TTS dataframes, plus counts for tab labels, plus logs
    return (
        commercial_df, asr_models_df, tts_models_df,
        len(asr_models), len(tts_models),
        asr_datasets_df, tts_datasets_df,
        len(asr_datasets), len(tts_datasets),
        log_text,
    )


# Initialize - load language list and app content
print("Initializing Speech Resource Finder...")
load_app_content()
load_language_list()
load_language_taxonomy()

# Create language choices for dropdown (code: name format for easy searching)
language_choices = [
    f"{code}: {info['name']}"
    for code, info in sorted(LANGUAGES.items(), key=lambda x: x[1]['name'])
]
print(f"Created dropdown with {len(language_choices)} language options")

with gr.Blocks(title=APP_CONTENT["title"]) as demo:
    gr.Markdown(f"# {APP_CONTENT['title']}")
    gr.Markdown(APP_CONTENT["description"])

    with gr.Row(equal_height=True):
        with gr.Column(scale=2):
            language_dropdown = gr.Dropdown(
                choices=language_choices,
                label="Select Language",
                info="Type to search for a language",
                allow_custom_value=False,
                filterable=True,
            )
        with gr.Column(scale=1):
            # Minimal placeholder markup shown before a language is selected
            language_metadata = gr.HTML(
                """
                <div style="text-align: center; padding: 8px; color: #757575;">
                    Select a language to see resource classification
                </div>
                """,
                elem_id="language-metadata",
            )

    gr.Markdown("## Commercial Services")
    commercial_table = gr.Dataframe(
        headers=["Service", "ASR", "TTS"],
        interactive=False,
        wrap=True,
    )

    gr.Markdown("## HuggingFace Models")
    with gr.Row():
        deduplicate_checkbox = gr.Checkbox(
            label="Deduplicate models",
            value=True,
            info="Keep only the model with most downloads for each base name",
        )

    # Create tabs for ASR and TTS models with count labels
    with gr.Tabs():
        with gr.Tab(label="ASR Models") as asr_tab:
            asr_count_label = gr.Markdown("*Loading...*")
            asr_models_table = gr.Dataframe(
                headers=["Model Name", "Downloads", "Likes", "Size", "Duplicates"],
                interactive=False,
                wrap=True,
                datatype=["markdown", "number", "number", "str", "number"],
            )
        with gr.Tab(label="TTS Models") as tts_tab:
            tts_count_label = gr.Markdown("*Loading...*")
            tts_models_table = gr.Dataframe(
                headers=["Model Name", "Downloads", "Likes", "Size", "Duplicates"],
                interactive=False,
                wrap=True,
                datatype=["markdown", "number", "number", "str", "number"],
            )

    gr.Markdown("## HuggingFace Datasets")
    # Create tabs for ASR and TTS datasets with count labels
    with gr.Tabs():
        with gr.Tab(label="ASR Datasets") as asr_datasets_tab:
            asr_datasets_count_label = gr.Markdown("*Loading...*")
            asr_datasets_table = gr.Dataframe(
                headers=["Dataset Name", "Downloads", "Likes", "Size"],
                interactive=False,
                wrap=True,
                datatype=["markdown", "number", "number", "str"],
            )
        with gr.Tab(label="TTS Datasets") as tts_datasets_tab:
            tts_datasets_count_label = gr.Markdown("*Loading...*")
            tts_datasets_table = gr.Dataframe(
                headers=["Dataset Name", "Downloads", "Likes", "Size"],
                interactive=False,
                wrap=True,
                datatype=["markdown", "number", "number", "str"],
            )

    with gr.Accordion("Logs", open=False):
        log_textbox = gr.Textbox(
            show_label=False,
            lines=15,
            max_lines=30,
            interactive=False,
            placeholder="Logs will appear here...",
            autoscroll=True,
        )

    # About section with full content
    with gr.Accordion("About this tool", open=False):
        gr.Markdown(APP_CONTENT["full_content"])

    def on_search(language_selection, deduplicate):
        if not language_selection:
            # Minimal placeholder markup shown when no language is selected
            default_html = """
            <div style="text-align: center; padding: 8px; color: #757575;">
                Select a language to see resource classification
            </div>
            """
            return default_html, None, "", None, "", None, "", None, "", None, ""

        # Extract the language code from "code: name" format
        language_code = language_selection.split(":")[0].strip()

        # Get language name for taxonomy lookup
        language_name = LANGUAGES.get(language_code, {}).get("name", "")

        # Get taxonomy classification
        level, classification = get_language_taxonomy_info(language_name)

        # Create metadata display with color coding
        # (minimal markup: the classification label is tinted with the taxonomy colour)
        if level is not None:
            color = get_taxonomy_color(level)
            metadata_html = f"""
            <div style="text-align: center; padding: 8px;">
                <div style="font-size: 1.1em; font-weight: bold;">{language_name}</div>
                <div style="color: {color}; font-weight: bold;">{classification}</div>
                <div style="font-size: 0.85em; color: #757575;">Source: Joshi et al.</div>
            </div>
            """
        else:
            metadata_html = f"""
            <div style="text-align: center; padding: 8px;">
                <div style="font-size: 1.1em; font-weight: bold;">{language_name}</div>
                <div style="color: #757575; font-weight: bold;">Unknown</div>
                <div style="font-size: 0.85em; color: #757575;">Source: Joshi et al.</div>
            </div>
            """

        (
            commercial_df, asr_models_df, tts_models_df,
            asr_models_count, tts_models_count,
            asr_datasets_df, tts_datasets_df,
            asr_datasets_count, tts_datasets_count,
            logs,
        ) = search_language_resources(language_code, deduplicate=deduplicate)

        # Create count labels
        asr_models_label = f"**Found {asr_models_count} ASR model(s)**"
        tts_models_label = f"**Found {tts_models_count} TTS model(s)**"
        asr_datasets_label = f"**Found {asr_datasets_count} ASR dataset(s)**"
        tts_datasets_label = f"**Found {tts_datasets_count} TTS dataset(s)**"

        return (
            metadata_html, commercial_df,
            asr_models_label, asr_models_df,
            tts_models_label, tts_models_df,
            asr_datasets_label, asr_datasets_df,
            tts_datasets_label, tts_datasets_df,
            logs,
        )

    # Trigger search when language is selected
    language_dropdown.change(
        fn=on_search,
        inputs=[language_dropdown, deduplicate_checkbox],
        outputs=[
            language_metadata, commercial_table,
            asr_count_label, asr_models_table,
            tts_count_label, tts_models_table,
            asr_datasets_count_label, asr_datasets_table,
            tts_datasets_count_label, tts_datasets_table,
            log_textbox,
        ],
    )

    # Trigger search when deduplicate checkbox is changed
    deduplicate_checkbox.change(
        fn=on_search,
        inputs=[language_dropdown, deduplicate_checkbox],
        outputs=[
            language_metadata, commercial_table,
            asr_count_label, asr_models_table,
            tts_count_label, tts_models_table,
            asr_datasets_count_label, asr_datasets_table,
            tts_datasets_count_label, tts_datasets_table,
            log_textbox,
        ],
    )


if __name__ == "__main__":
    demo.launch(server_name="0.0.0.0", server_port=7860, share=False, show_error=True)