import gradio as gr
import pandas as pd
import requests
from bs4 import BeautifulSoup
from functools import lru_cache
import csv
from io import StringIO
import re


# Data files and external sources
LANGUAGE_CODES_FILE = "language-codes-full.csv"
APP_CONTENT_FILE = "app_content.md"
LANGUAGE_TAXONOMY_URL = "https://microsoft.github.io/linguisticdiversity/assets/lang2tax.txt"

# Language registry: primary ISO 639 code -> {name, alpha3_b, alpha3_t, alpha2}
LANGUAGES = {}

# Taxonomy map: lowercase language name -> resource level (0-5)
LANGUAGE_TAXONOMY = {}

# Resource-level labels from Joshi et al.'s linguistic diversity taxonomy
TAXONOMY_LEVELS = {
    0: "The Left-Behinds",
    1: "The Scraping-Bys",
    2: "The Hopefuls",
    3: "The Rising Stars",
    4: "The Underdogs",
    5: "The Winners"
}

# Default app content, overridden by load_app_content()
APP_CONTENT = {
    "title": "Speech Resource Finder",
    "description": "Search for speech resources",
    "full_content": ""
}


def load_app_content(content_path=None):
    """Load app content from a markdown file"""
    global APP_CONTENT
    if content_path is None:
        content_path = APP_CONTENT_FILE

    try:
        with open(content_path, 'r', encoding='utf-8') as f:
            content = f.read()

        lines = content.split('\n')

        # Title: first level-1 heading in the file
        title = "Speech Resource Finder"
        for line in lines:
            if line.startswith('# '):
                title = line[2:].strip()
                break

        # Description: text of the "## Description" section
        description = ""
        in_description = False
        for line in lines:
            if line.startswith('## Description'):
                in_description = True
                continue
            elif in_description and line.startswith('##'):
                break
            elif in_description and line.strip():
                description += line.strip() + " "

        APP_CONTENT = {
            "title": title,
            "description": description.strip(),
            "full_content": content
        }
        print(f"Loaded app content from {content_path}")
    except Exception as e:
        print(f"Error loading app content: {e}")
        print("Using default content")


def load_language_list(csv_path=None):
    """Load ISO 639 language codes from a CSV file"""
    global LANGUAGES
    if csv_path is None:
        csv_path = LANGUAGE_CODES_FILE

    try:
        with open(csv_path, 'r', encoding='utf-8') as f:
            reader = csv.DictReader(f)
            for row in reader:
                code_b = row['alpha3-b'].strip()
                code_t = row['alpha3-t'].strip()
                code_2 = row['alpha2'].strip()
                name = row['English'].strip()

                # Prefer the bibliographic alpha3 code as the primary key
                primary_code = code_b if code_b else code_t

                if primary_code and name:
                    LANGUAGES[primary_code] = {
                        "name": name,
                        "alpha3_b": code_b,
                        "alpha3_t": code_t,
                        "alpha2": code_2
                    }
        print(f"Loaded {len(LANGUAGES)} languages from {csv_path}")
    except Exception as e:
        print(f"Error loading language list: {e}")

        # Minimal built-in fallback so the app still works without the CSV
        LANGUAGES = {
            "eng": {"name": "English", "alpha3_b": "eng", "alpha3_t": "", "alpha2": "en"},
            "spa": {"name": "Spanish", "alpha3_b": "spa", "alpha3_t": "", "alpha2": "es"},
            "fra": {"name": "French", "alpha3_b": "fra", "alpha3_t": "", "alpha2": "fr"},
            "deu": {"name": "German", "alpha3_b": "ger", "alpha3_t": "deu", "alpha2": "de"},
        }
        print(f"Using fallback with {len(LANGUAGES)} languages")


def load_language_taxonomy():
    """Load language taxonomy data from Microsoft's linguistic diversity project"""
    global LANGUAGE_TAXONOMY

    try:
        response = requests.get(LANGUAGE_TAXONOMY_URL, timeout=10)
        response.raise_for_status()

        # Each line maps a language name to its taxonomy level: "<name>,<level>"
        for line in response.text.strip().split('\n'):
            if line.strip():
                parts = line.strip().split(',')
                if len(parts) == 2:
                    lang_name = parts[0].strip().lower()
                    level = int(parts[1].strip())
                    LANGUAGE_TAXONOMY[lang_name] = level

        print(f"Loaded taxonomy data for {len(LANGUAGE_TAXONOMY)} languages")
    except Exception as e:
        print(f"Warning: Could not load language taxonomy: {e}")
        print("Language classification will show as 'Unknown'")


def get_taxonomy_color(level):
    """
    Get color code for taxonomy level (red for left-behind, green for winners)
    """
    colors = {
        0: "#d32f2f",
        1: "#f57c00",
        2: "#fbc02d",
        3: "#afb42b",
        4: "#7cb342",
        5: "#388e3c",
    }
    return colors.get(level, "#757575")


def get_language_taxonomy_info(language_name):
    """
    Get taxonomy classification for a language.
    Returns a tuple of (level, description) or (None, "Unknown")
    """
    if not language_name:
        return None, "Unknown"

    # Try an exact (case-insensitive) match on the language name
    lang_lower = language_name.lower()
    if lang_lower in LANGUAGE_TAXONOMY:
        level = LANGUAGE_TAXONOMY[lang_lower]
        return level, TAXONOMY_LEVELS.get(level, f"Level {level}")

    # Some ISO names list alternatives separated by ';' (e.g. "Spanish; Castilian")
    if ';' in lang_lower:
        parts = [p.strip() for p in lang_lower.split(';')]
        for part in parts:
            if part in LANGUAGE_TAXONOMY:
                level = LANGUAGE_TAXONOMY[part]
                return level, TAXONOMY_LEVELS.get(level, f"Level {level}")

    # Others use ',' as the separator
    if ',' in lang_lower:
        parts = [p.strip() for p in lang_lower.split(',')]
        for part in parts:
            if part in LANGUAGE_TAXONOMY:
                level = LANGUAGE_TAXONOMY[part]
                return level, TAXONOMY_LEVELS.get(level, f"Level {level}")

    return None, "Unknown"
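
# Illustrative lookups for the helper above (levels are hypothetical examples;
# the real values come from lang2tax.txt at runtime):
#   get_language_taxonomy_info("English")            -> (5, "The Winners")
#   get_language_taxonomy_info("Spanish; Castilian") -> matched via the "spanish" part
#   get_language_taxonomy_info("Klingon")            -> (None, "Unknown")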


@lru_cache(maxsize=1)
def fetch_azure_asr_languages():
    """Scrape Azure Speech-to-Text supported languages"""
    url = "https://learn.microsoft.com/en-us/azure/ai-services/speech-service/language-support?tabs=stt"

    try:
        response = requests.get(url, timeout=10)
        response.raise_for_status()
        soup = BeautifulSoup(response.content, 'html.parser')

        # Find the table whose header mentions Locale/Language
        tables = soup.find_all('table')

        azure_asr = {}
        for table in tables:
            rows = table.find_all('tr')
            if not rows:
                continue

            headers = [th.get_text(strip=True) for th in rows[0].find_all('th')]
            if 'Locale' in ' '.join(headers) or 'Language' in ' '.join(headers):
                for row in rows[1:]:
                    cols = row.find_all('td')
                    if len(cols) >= 2:
                        locale = cols[0].get_text(strip=True)
                        language = cols[1].get_text(strip=True)
                        if locale and language:
                            azure_asr[locale] = language
                break

        return azure_asr
    except Exception as e:
        print(f"Error fetching Azure ASR data: {e}")
        return {}


@lru_cache(maxsize=1)
def fetch_azure_tts_languages():
    """Scrape Azure Text-to-Speech supported languages with voice counts"""
    url = "https://learn.microsoft.com/en-us/azure/ai-services/speech-service/language-support?tabs=tts"

    try:
        response = requests.get(url, timeout=10)
        response.raise_for_status()
        soup = BeautifulSoup(response.content, 'html.parser')

        # Find the table whose header mentions Text to speech / voices
        tables = soup.find_all('table')

        azure_tts = {}
        for table in tables:
            rows = table.find_all('tr')
            if not rows:
                continue

            headers = [th.get_text(strip=True) for th in rows[0].find_all('th')]
            if 'Text to speech' in ' '.join(headers) or 'voices' in ' '.join(headers).lower():
                for row in rows[1:]:
                    cols = row.find_all('td')
                    if len(cols) >= 3:
                        locale = cols[0].get_text(strip=True)
                        language = cols[1].get_text(strip=True)
                        voices_text = cols[2].get_text(strip=True)
                        # Each neural voice name contains the word "Neural"
                        voice_count = voices_text.count('Neural')
                        if locale and language:
                            azure_tts[locale] = {
                                'language': language,
                                'voice_count': voice_count
                            }
                break

        return azure_tts
    except Exception as e:
        print(f"Error fetching Azure TTS data: {e}")
        return {}


@lru_cache(maxsize=1)
def fetch_google_stt_languages():
    """Scrape Google Cloud Speech-to-Text supported languages"""
    url = "https://cloud.google.com/speech-to-text/docs/speech-to-text-supported-languages"

    try:
        response = requests.get(url, timeout=10)
        response.raise_for_status()
        soup = BeautifulSoup(response.content, 'html.parser')

        tables = soup.find_all('table')

        google_stt = {}
        for table in tables:
            rows = table.find_all('tr')
            if not rows:
                continue

            headers = [th.get_text(strip=True) for th in rows[0].find_all('th')]

            # Locate the BCP-47 code column and the language name column
            bcp47_idx = None
            name_idx = None
            for idx, header in enumerate(headers):
                if 'BCP-47' in header or 'BCP47' in header:
                    bcp47_idx = idx
                if 'Name' in header and name_idx is None:
                    name_idx = idx

            if bcp47_idx is not None:
                for row in rows[1:]:
                    cols = row.find_all('td')
                    if len(cols) > bcp47_idx:
                        locale = cols[bcp47_idx].get_text(strip=True)
                        # Use "is not None" so a name column at index 0 is not skipped
                        language = cols[name_idx].get_text(strip=True) if name_idx is not None and len(cols) > name_idx else ''
                        if locale and locale not in ['—', '-', '']:
                            google_stt[locale] = language

        return google_stt
    except Exception as e:
        print(f"Error fetching Google STT data: {e}")
        return {}


@lru_cache(maxsize=1)
def fetch_google_tts_languages():
    """Scrape Google Cloud Text-to-Speech supported languages with voice counts"""
    url = "https://cloud.google.com/text-to-speech/docs/voices"

    try:
        response = requests.get(url, timeout=10)
        response.raise_for_status()
        soup = BeautifulSoup(response.content, 'html.parser')

        # The voices page lists one voice per row; count rows per language code
        tables = soup.find_all('table')

        google_tts = {}
        for table in tables:
            rows = table.find_all('tr')
            if not rows:
                continue

            headers = [th.get_text(strip=True) for th in rows[0].find_all('th')]

            # Locate the language code column
            lang_code_idx = None
            for idx, header in enumerate(headers):
                if 'Language code' in header or 'language code' in header.lower():
                    lang_code_idx = idx
                    break

            if lang_code_idx is not None:
                for row in rows[1:]:
                    cols = row.find_all('td')
                    if len(cols) > lang_code_idx:
                        locale = cols[lang_code_idx].get_text(strip=True)
                        if locale and locale not in ['—', '-', '']:
                            # One row per voice: increment the count for known locales
                            if locale in google_tts:
                                google_tts[locale]['voice_count'] += 1
                            else:
                                language = cols[0].get_text(strip=True) if len(cols) > 0 else ''
                                google_tts[locale] = {
                                    'language': language,
                                    'voice_count': 1
                                }

        return google_tts
    except Exception as e:
        print(f"Error fetching Google TTS data: {e}")
        return {}


@lru_cache(maxsize=1)
def fetch_elevenlabs_multilingual_v2():
    """Get ElevenLabs Multilingual v2 supported languages"""
    # Hard-coded set of supported language codes (two-letter style, plus 'fil')
    supported_codes = {
        'en', 'ja', 'zh', 'de', 'hi', 'fr', 'ko', 'pt', 'it', 'es',
        'id', 'nl', 'tr', 'fil', 'pl', 'sv', 'bg', 'ro', 'ar', 'cs',
        'el', 'fi', 'hr', 'ms', 'sk', 'da', 'ta', 'uk', 'ru'
    }
    return supported_codes


@lru_cache(maxsize=1)
def fetch_elevenlabs_turbo_v3():
    """Get ElevenLabs Eleven Turbo v3 (formerly v3 Alpha) supported languages"""
    # Hard-coded set of supported language codes (ISO 639-3 style)
    supported_codes = {
        'afr', 'ara', 'hye', 'asm', 'aze', 'bel', 'ben', 'bos', 'bul', 'cat',
        'ceb', 'nya', 'hrv', 'ces', 'dan', 'nld', 'eng', 'est', 'fil', 'fin',
        'fra', 'glg', 'kat', 'deu', 'ell', 'guj', 'hau', 'heb', 'hin', 'hun',
        'isl', 'ind', 'gle', 'ita', 'jpn', 'jav', 'kan', 'kaz', 'kir', 'kor',
        'lav', 'lin', 'lit', 'ltz', 'mkd', 'msa', 'mal', 'cmn', 'mar', 'nep',
        'nor', 'pus', 'fas', 'pol', 'por', 'pan', 'ron', 'rus', 'srp', 'snd',
        'slk', 'slv', 'som', 'spa', 'swa', 'swe', 'tam', 'tel', 'tha', 'tur',
        'ukr', 'urd', 'vie', 'cym'
    }
    return supported_codes


@lru_cache(maxsize=1)
def fetch_aws_transcribe_languages():
    """Scrape AWS Transcribe (ASR) supported languages"""
    url = "https://docs.aws.amazon.com/transcribe/latest/dg/supported-languages.html"

    try:
        response = requests.get(url, timeout=10)
        response.raise_for_status()
        soup = BeautifulSoup(response.content, 'html.parser')

        tables = soup.find_all('table')

        aws_transcribe = {}
        for table in tables:
            rows = table.find_all('tr')
            if not rows:
                continue

            headers = [th.get_text(strip=True) for th in rows[0].find_all('th')]

            # Locate the language code and language name columns
            lang_code_idx = None
            lang_name_idx = None
            for idx, header in enumerate(headers):
                if 'Language code' in header or 'language code' in header.lower():
                    lang_code_idx = idx
                if 'Language' == header or header.startswith('Language'):
                    lang_name_idx = idx

            if lang_code_idx is not None:
                for row in rows[1:]:
                    cols = row.find_all('td')
                    if len(cols) > lang_code_idx:
                        locale = cols[lang_code_idx].get_text(strip=True)
                        # Use "is not None" so a name column at index 0 is not skipped
                        language = cols[lang_name_idx].get_text(strip=True) if lang_name_idx is not None and len(cols) > lang_name_idx else ''
                        if locale and locale not in ['—', '-', '']:
                            aws_transcribe[locale] = language

        return aws_transcribe
    except Exception as e:
        print(f"Error fetching AWS Transcribe data: {e}")
        return {}


@lru_cache(maxsize=1)
def fetch_aws_polly_languages():
    """Scrape AWS Polly (TTS) supported languages"""
    url = "https://docs.aws.amazon.com/polly/latest/dg/supported-languages.html"

    try:
        response = requests.get(url, timeout=10)
        response.raise_for_status()
        soup = BeautifulSoup(response.content, 'html.parser')

        tables = soup.find_all('table')

        aws_polly = {}
        for table in tables:
            rows = table.find_all('tr')
            if not rows:
                continue

            headers = [th.get_text(strip=True) for th in rows[0].find_all('th')]

            # Locate the language code and language name columns
            lang_code_idx = None
            lang_name_idx = None
            for idx, header in enumerate(headers):
                if 'Language code' in header or 'language code' in header.lower():
                    lang_code_idx = idx
                if 'Language' == header or header.startswith('Language'):
                    lang_name_idx = idx

            if lang_code_idx is not None:
                for row in rows[1:]:
                    cols = row.find_all('td')
                    if len(cols) > lang_code_idx:
                        locale = cols[lang_code_idx].get_text(strip=True)
                        # Use "is not None" so a name column at index 0 is not skipped
                        language = cols[lang_name_idx].get_text(strip=True) if lang_name_idx is not None and len(cols) > lang_name_idx else ''
                        if locale and locale not in ['—', '-', '']:
                            # One row per voice: increment the count for known locales
                            if locale in aws_polly:
                                aws_polly[locale]['voice_count'] += 1
                            else:
                                aws_polly[locale] = {
                                    'language': language,
                                    'voice_count': 1
                                }

        return aws_polly
    except Exception as e:
        print(f"Error fetching AWS Polly data: {e}")
        return {}


def get_azure_locales_for_language(language_code):
    """
    Get Azure BCP-47 locales for a language using its alpha2 code
    Returns list of matching locales from Azure
    """
    lang_info = LANGUAGES.get(language_code)
    if not lang_info or not lang_info['alpha2']:
        return []

    alpha2 = lang_info['alpha2']
    azure_asr = fetch_azure_asr_languages()
    azure_tts = fetch_azure_tts_languages()

    # A locale matches if it equals the alpha2 code or starts with "<alpha2>-"
    matching_locales = set()

    for locale in azure_asr.keys():
        if locale.startswith(alpha2 + '-') or locale == alpha2:
            matching_locales.add(locale)

    for locale in azure_tts.keys():
        if locale.startswith(alpha2 + '-') or locale == alpha2:
            matching_locales.add(locale)

    return sorted(matching_locales)
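
# Illustrative example of the locale-matching rule used here and in the Google/AWS
# helpers below (locale values are hypothetical, not fetched data): for English
# (alpha2 "en"), "en-US", "en-GB", and "en-IN" would match, while "es-US" would not.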


def get_google_locales_for_language(language_code):
    """
    Get Google Cloud BCP-47 locales for a language using its alpha2 code
    Returns list of matching locales from Google Cloud
    """
    lang_info = LANGUAGES.get(language_code)
    if not lang_info or not lang_info['alpha2']:
        return []

    alpha2 = lang_info['alpha2']
    google_stt = fetch_google_stt_languages()
    google_tts = fetch_google_tts_languages()

    matching_locales = set()

    for locale in google_stt.keys():
        if locale.startswith(alpha2 + '-') or locale == alpha2:
            matching_locales.add(locale)

    for locale in google_tts.keys():
        if locale.startswith(alpha2 + '-') or locale == alpha2:
            matching_locales.add(locale)

    return sorted(matching_locales)


def check_elevenlabs_multilingual_v2_support(language_code):
    """
    Check if ElevenLabs Multilingual v2 supports a language using ISO 639-1 (alpha2) codes
    Returns True if supported, False otherwise
    """
    lang_info = LANGUAGES.get(language_code)
    if not lang_info:
        return False

    supported_codes = fetch_elevenlabs_multilingual_v2()

    if lang_info['alpha2'] and lang_info['alpha2'] in supported_codes:
        return True

    return False


def check_elevenlabs_turbo_v3_support(language_code):
    """
    Check if ElevenLabs Turbo v3 supports a language using ISO 639-3 (alpha3) codes
    Returns True if supported, False otherwise
    """
    lang_info = LANGUAGES.get(language_code)
    if not lang_info:
        return False

    supported_codes = fetch_elevenlabs_turbo_v3()

    # Try both the bibliographic and terminological alpha3 codes
    if lang_info['alpha3_b'] and lang_info['alpha3_b'] in supported_codes:
        return True

    if lang_info['alpha3_t'] and lang_info['alpha3_t'] in supported_codes:
        return True

    return False


def get_aws_locales_for_language(language_code):
    """
    Get AWS locales for a language using its alpha2 code
    Returns list of matching locales from AWS Transcribe and Polly
    """
    lang_info = LANGUAGES.get(language_code)
    if not lang_info or not lang_info['alpha2']:
        return []

    alpha2 = lang_info['alpha2']
    aws_transcribe = fetch_aws_transcribe_languages()
    aws_polly = fetch_aws_polly_languages()

    matching_locales = set()

    for locale in aws_transcribe.keys():
        if locale.startswith(alpha2 + '-') or locale == alpha2:
            matching_locales.add(locale)

    for locale in aws_polly.keys():
        if locale.startswith(alpha2 + '-') or locale == alpha2:
            matching_locales.add(locale)

    return sorted(matching_locales)


def search_huggingface_models(language_code, pipeline_tag, max_results=100, max_pages=3):
    """
    Search HuggingFace for models supporting a specific language
    pipeline_tag: 'automatic-speech-recognition' or 'text-to-speech'
    max_results: maximum number of models to return
    max_pages: maximum number of pages to search per language code
    Returns tuple: (list of model dictionaries, log messages)
    """
    lang_info = LANGUAGES.get(language_code)
    logs = []

    if not lang_info:
        logs.append(f"No language info found for code: {language_code}")
        return [], logs

    # Collect every ISO code variant for this language
    codes_to_try = []
    if lang_info['alpha2']:
        codes_to_try.append(lang_info['alpha2'])
    if lang_info['alpha3_b']:
        codes_to_try.append(lang_info['alpha3_b'])
    if lang_info['alpha3_t']:
        codes_to_try.append(lang_info['alpha3_t'])

    logs.append(f"Language codes to search: {set(codes_to_try)}")

    models = []
    seen_models = set()

    for code in codes_to_try:
        if len(models) >= max_results:
            break

        logs.append(f"Searching for language code: {code}")

        for page in range(max_pages):
            if len(models) >= max_results:
                break

            try:
                url = f"https://huggingface.co/models?pipeline_tag={pipeline_tag}&language={code}&sort=trending"
                if page > 0:
                    url += f"&p={page}"

                logs.append(f" Page {page}: {url}")

                headers = {
                    'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36'
                }

                response = requests.get(url, headers=headers, timeout=10)
                response.raise_for_status()

                soup = BeautifulSoup(response.content, 'html.parser')

                model_cards = soup.find_all('article', class_='overview-card-wrapper')

                if not model_cards:
                    logs.append(f" No model cards found on page {page}")
                    break

                logs.append(f" Found {len(model_cards)} model cards on page {page}")

                for card in model_cards:
                    if len(models) >= max_results:
                        break

                    try:
                        link = card.find('a', href=True)
                        if link:
                            href = link.get('href', '')
                            model_name = href.lstrip('/')

                            if model_name and model_name != '#' and model_name not in seen_models:
                                seen_models.add(model_name)

                                downloads = 0
                                likes = 0
                                size = ""

                                # Stats are rendered as an icon followed by a number
                                svgs = card.find_all('svg')

                                for svg in svgs:
                                    # The stat value is the text node (or tag) right after the icon
                                    next_elem = svg.find_next_sibling(string=True)
                                    stat_text = ""

                                    if next_elem and next_elem.strip():
                                        stat_text = next_elem.strip()
                                    else:
                                        next_tag = svg.find_next_sibling()
                                        if next_tag:
                                            stat_text = next_tag.get_text(strip=True)

                                    if not stat_text or len(stat_text) < 1:
                                        continue

                                    svg_str = str(svg)

                                    # Identify the stat by the icon's SVG path: download arrow
                                    if 'M26 24v4H6v-4H4v4a2 2 0 0 0 2 2h20a2 2 0 0 0 2-2v-4zm0-10l-1.41-1.41L17 20.17V2h-2v18.17l-7.59-7.58L6 14l10 10l10-10z' in svg_str:
                                        downloads = parse_stat_number(stat_text)
                                    # Heart icon: likes
                                    elif 'M22.45,6a5.47,5.47,0,0,1,3.91,1.64,5.7,5.7,0,0,1,0,8L16,26.13' in svg_str:
                                        likes = parse_stat_number(stat_text)
                                    # Remaining icon: text that looks like a parameter count (e.g. "7B") is the model size
                                    elif 'M10 10H8.4V8.4H10V10Zm0-3.2H8.4V5.2H10v1.6ZM6.8 10H5.2V8.4h1.6V10Z' in svg_str:
                                        if len(stat_text) <= 6 and re.search(r'\d+\.?\d*\s*[Bb]', stat_text):
                                            size = stat_text

                                models.append({
                                    'name': model_name,
                                    'url': f"https://huggingface.co/{model_name}",
                                    'downloads': downloads,
                                    'likes': likes,
                                    'size': size
                                })
                    except Exception as e:
                        logs.append(f" Error parsing model card: {e}")
                        continue

            except Exception as e:
                logs.append(f" ERROR searching page {page}: {e}")
                break

    # Sort by downloads, most popular first
    models.sort(key=lambda x: x['downloads'], reverse=True)

    logs.append(f"Total unique models found: {len(models)}")
    return models, logs
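
# Illustrative call (the model name and numbers below are hypothetical; real results
# depend on what HuggingFace serves at request time):
#   models, logs = search_huggingface_models("swa", "automatic-speech-recognition",
#                                            max_results=10, max_pages=1)
#   models[0] -> {'name': 'some-org/wav2vec2-swahili', 'url': 'https://huggingface.co/some-org/wav2vec2-swahili',
#                 'downloads': 1200, 'likes': 4, 'size': ''}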


def get_huggingface_stats(item_name, item_type='datasets'):
    """
    Get likes and downloads for a HuggingFace dataset or model using the API
    item_type: 'datasets' or 'models'
    Returns dict with likes and downloads

    NOTE: This method is currently NOT USED. We parse stats directly from HTML instead.
    Keeping it here as a fallback in case HTML parsing fails.
    """
    try:
        api_url = f"https://huggingface.co/api/{item_type}/{item_name}"
        response = requests.get(api_url, timeout=5)

        if response.status_code == 200:
            data = response.json()
            return {
                'likes': data.get('likes', 0),
                'downloads': data.get('downloads', 0)
            }
    except Exception:
        pass

    return {'likes': 0, 'downloads': 0}


def parse_stat_number(stat_text):
    """
    Parse HuggingFace stat numbers like '4.07M', '23.4k', '349' into integers
    Returns integer value or 0 if parsing fails
    """
    if not stat_text:
        return 0

    stat_text = stat_text.strip().upper()

    try:
        # Millions, e.g. '4.07M'
        if 'M' in stat_text:
            return int(float(stat_text.replace('M', '')) * 1_000_000)
        # Thousands, e.g. '23.4K'
        elif 'K' in stat_text:
            return int(float(stat_text.replace('K', '')) * 1_000)
        # Plain numbers, possibly with thousands separators
        else:
            return int(stat_text.replace(',', ''))
    except (ValueError, AttributeError):
        return 0
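
# Illustrative conversions for the parser above (example strings, not scraped data):
#   parse_stat_number("4.07M") -> 4070000
#   parse_stat_number("23.4k") -> 23400
#   parse_stat_number("1,349") -> 1349
#   parse_stat_number("n/a")   -> 0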


def deduplicate_models(models):
    """
    Deduplicate models by base name (without user/org prefix)
    Keep the model with most downloads and count duplicates
    Returns list of deduplicated models with duplicate count added
    """
    from collections import defaultdict

    # Group models that share the same base name (the part after "user/")
    grouped = defaultdict(list)
    for model in models:
        name_parts = model['name'].split('/')
        if len(name_parts) > 1:
            base_name = name_parts[-1]
        else:
            base_name = model['name']

        grouped[base_name].append(model)

    # Within each group keep the most-downloaded model
    deduplicated = []
    for base_name, model_list in grouped.items():
        model_list.sort(key=lambda x: x['downloads'], reverse=True)
        best_model = model_list[0]

        # Record how many other copies were folded into this entry
        best_model['duplicates'] = len(model_list) - 1

        deduplicated.append(best_model)

    # Sort the final list by downloads
    deduplicated.sort(key=lambda x: x['downloads'], reverse=True)

    return deduplicated
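
# Illustrative behaviour of deduplicate_models (hypothetical entries, not real models):
# given "orgA/whisper-small" with 120 downloads and "orgB/whisper-small" with 80 downloads,
# both share the base name "whisper-small", so only the orgA entry is kept and its
# 'duplicates' field is set to 1.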


def search_huggingface_datasets(language_code, task_category, max_results=100, max_pages=3):
    """
    Search HuggingFace for datasets supporting a specific language
    task_category: 'automatic-speech-recognition' or 'text-to-speech'
    max_results: maximum number of datasets to return
    max_pages: maximum number of pages to search per language code
    Returns tuple: (list of dataset dictionaries, log messages)
    """
    lang_info = LANGUAGES.get(language_code)
    logs = []

    if not lang_info:
        logs.append(f"No language info found for code: {language_code}")
        return [], logs

    # Collect every ISO code variant for this language
    language_codes = set()
    if lang_info['alpha2']:
        language_codes.add(lang_info['alpha2'])
    if lang_info['alpha3_b']:
        language_codes.add(lang_info['alpha3_b'])
    if lang_info['alpha3_t']:
        language_codes.add(lang_info['alpha3_t'])

    logs.append(f"Language codes to search: {language_codes}")

    datasets = []
    seen_datasets = set()

    for code in language_codes:
        if len(datasets) >= max_results:
            break

        logs.append(f"Searching for language code: {code}")

        for page in range(max_pages):
            if len(datasets) >= max_results:
                break

            try:
                url = f"https://huggingface.co/datasets?task_categories=task_categories:{task_category}&language=language:{code}&sort=trending"
                if page > 0:
                    url += f"&p={page}"

                logs.append(f" Page {page}: {url}")

                headers = {
                    'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36'
                }

                response = requests.get(url, headers=headers, timeout=10)
                response.raise_for_status()

                soup = BeautifulSoup(response.content, 'html.parser')

                dataset_cards = soup.find_all('article', class_='overview-card-wrapper')

                if not dataset_cards:
                    logs.append(f" No dataset cards found on page {page}")
                    break

                logs.append(f" Found {len(dataset_cards)} dataset cards on page {page}")

                for card in dataset_cards:
                    if len(datasets) >= max_results:
                        break

                    try:
                        link = card.find('a', href=True)
                        if link:
                            href = link.get('href', '')
                            dataset_path = href.lstrip('/')

                            # Dataset links are prefixed with "datasets/"
                            if dataset_path.startswith('datasets/'):
                                dataset_name = dataset_path[9:]
                            else:
                                dataset_name = dataset_path

                            if dataset_name and dataset_name != '#' and dataset_name not in seen_datasets:
                                seen_datasets.add(dataset_name)

                                downloads = 0
                                likes = 0
                                size = ""

                                # Stats are rendered as an icon followed by a number
                                svgs = card.find_all('svg')

                                for svg in svgs:
                                    # The stat value is the text node (or tag) right after the icon
                                    next_elem = svg.find_next_sibling(string=True)
                                    stat_text = ""

                                    if next_elem and next_elem.strip():
                                        stat_text = next_elem.strip()
                                    else:
                                        next_tag = svg.find_next_sibling()
                                        if next_tag:
                                            stat_text = next_tag.get_text(strip=True)

                                    if not stat_text or len(stat_text) < 1 or stat_text in ['Viewer', 'Updated']:
                                        continue

                                    svg_str = str(svg)

                                    # Identify the stat by the icon's SVG path: download arrow
                                    if 'M26 24v4H6v-4H4v4a2 2 0 0 0 2 2h20a2 2 0 0 0 2-2v-4zm0-10l-1.41-1.41L17 20.17V2h-2v18.17l-7.59-7.58L6 14l10 10l10-10z' in svg_str:
                                        downloads = parse_stat_number(stat_text)
                                    # Heart icon: likes
                                    elif 'M22.45,6a5.47,5.47,0,0,1,3.91,1.64,5.7,5.7,0,0,1,0,8L16,26.13' in svg_str:
                                        likes = parse_stat_number(stat_text)
                                    # Remaining icon with evenodd fill rules: numeric-looking text is the dataset size
                                    elif 'fill-rule="evenodd"' in svg_str and 'clip-rule="evenodd"' in svg_str:
                                        if any(c in stat_text for c in ['k', 'K', 'm', 'M']) or stat_text.replace(',', '').replace('.', '').isdigit():
                                            size = stat_text

                                datasets.append({
                                    'name': dataset_name,
                                    'url': f"https://huggingface.co/datasets/{dataset_name}",
                                    'downloads': downloads,
                                    'likes': likes,
                                    'size': size
                                })
                    except Exception as e:
                        logs.append(f" Error parsing dataset card: {e}")
                        continue

            except Exception as e:
                logs.append(f" ERROR searching page {page}: {e}")
                break

    # Sort by downloads, most popular first
    datasets.sort(key=lambda x: x['downloads'], reverse=True)

    logs.append(f"Total unique datasets found: {len(datasets)}")
    return datasets, logs


def search_language_resources(language_code, deduplicate=False):
    """
    Search for ASR/TTS resources for a given language
    Returns results organized by service type
    deduplicate: if True, remove duplicate models (same base name) and keep only the one with most downloads
    """
    all_logs = []

    if not language_code:
        return None, None, None, 0, 0, None, None, 0, 0, ""

    lang_info = LANGUAGES.get(language_code)
    if not lang_info:
        return None, None, None, 0, 0, None, None, 0, 0, ""

    language_name = lang_info['name']
    all_logs.append(f"=== Searching for {language_name} ({language_code}) ===")
    all_logs.append(f"Language codes: alpha2={lang_info['alpha2']}, alpha3_b={lang_info['alpha3_b']}, alpha3_t={lang_info['alpha3_t']}")

    # Azure Speech Services
    all_logs.append("\n[Azure Speech Services]")
    azure_asr = fetch_azure_asr_languages()
    azure_tts = fetch_azure_tts_languages()
    all_logs.append(f" Fetched {len(azure_asr)} ASR languages and {len(azure_tts)} TTS languages from Azure")

    azure_locales = get_azure_locales_for_language(language_code)
    all_logs.append(f" Matching Azure locales: {azure_locales}")

    azure_asr_locales = [loc for loc in azure_locales if loc in azure_asr]
    azure_asr_available = len(azure_asr_locales) > 0
    all_logs.append(f" Azure ASR: {'✅ Supported' if azure_asr_available else '❌ Not supported'} ({len(azure_asr_locales)} locales)")

    azure_tts_locales = [loc for loc in azure_locales if loc in azure_tts]
    azure_tts_available = len(azure_tts_locales) > 0
    azure_total_voices = sum(azure_tts[loc]['voice_count'] for loc in azure_tts_locales)
    all_logs.append(f" Azure TTS: {'✅ Supported' if azure_tts_available else '❌ Not supported'} ({len(azure_tts_locales)} locales, {azure_total_voices} voices)")

    # Google Cloud Speech
    all_logs.append("\n[Google Cloud Speech]")
    google_stt = fetch_google_stt_languages()
    google_tts = fetch_google_tts_languages()
    all_logs.append(f" Fetched {len(google_stt)} STT languages and {len(google_tts)} TTS languages from Google Cloud")

    google_locales = get_google_locales_for_language(language_code)
    all_logs.append(f" Matching Google Cloud locales: {google_locales}")

    google_stt_locales = [loc for loc in google_locales if loc in google_stt]
    google_stt_available = len(google_stt_locales) > 0
    all_logs.append(f" Google STT: {'✅ Supported' if google_stt_available else '❌ Not supported'} ({len(google_stt_locales)} locales)")

    google_tts_locales = [loc for loc in google_locales if loc in google_tts]
    google_tts_available = len(google_tts_locales) > 0
    google_total_voices = sum(google_tts[loc]['voice_count'] for loc in google_tts_locales)
    all_logs.append(f" Google TTS: {'✅ Supported' if google_tts_available else '❌ Not supported'} ({len(google_tts_locales)} locales, {google_total_voices} voices)")

    # AWS (Transcribe + Polly)
    all_logs.append("\n[AWS (Transcribe + Polly)]")
    aws_transcribe = fetch_aws_transcribe_languages()
    aws_polly = fetch_aws_polly_languages()
    all_logs.append(f" Fetched {len(aws_transcribe)} Transcribe languages and {len(aws_polly)} Polly languages from AWS")

    aws_locales = get_aws_locales_for_language(language_code)
    all_logs.append(f" Matching AWS locales: {aws_locales}")

    aws_transcribe_locales = [loc for loc in aws_locales if loc in aws_transcribe]
    aws_transcribe_available = len(aws_transcribe_locales) > 0
    all_logs.append(f" AWS Transcribe: {'✅ Supported' if aws_transcribe_available else '❌ Not supported'} ({len(aws_transcribe_locales)} locales)")

    aws_polly_locales = [loc for loc in aws_locales if loc in aws_polly]
    aws_polly_available = len(aws_polly_locales) > 0
    aws_total_voices = sum(aws_polly[loc]['voice_count'] for loc in aws_polly_locales)
    all_logs.append(f" AWS Polly: {'✅ Supported' if aws_polly_available else '❌ Not supported'} ({len(aws_polly_locales)} locales, {aws_total_voices} voices)")

    # Build the commercial services summary table
    commercial_rows = []

    if azure_asr_available:
        azure_asr_text = f"✅ {len(azure_asr_locales)} locale(s)"
    else:
        azure_asr_text = "❌ N/A"

    if azure_tts_available:
        azure_tts_text = f"✅ {len(azure_tts_locales)} locale(s), {azure_total_voices} voice(s)"
    else:
        azure_tts_text = "❌ N/A"

    commercial_rows.append({
        "Service": "Azure Speech",
        "ASR": azure_asr_text,
        "TTS": azure_tts_text,
    })

    if google_stt_available:
        google_stt_text = f"✅ {len(google_stt_locales)} locale(s)"
    else:
        google_stt_text = "❌ N/A"

    if google_tts_available:
        google_tts_text = f"✅ {len(google_tts_locales)} locale(s), {google_total_voices} voice(s)"
    else:
        google_tts_text = "❌ N/A"

    commercial_rows.append({
        "Service": "Google Cloud Speech",
        "ASR": google_stt_text,
        "TTS": google_tts_text,
    })

    if aws_transcribe_available:
        aws_transcribe_text = f"✅ {len(aws_transcribe_locales)} locale(s)"
    else:
        aws_transcribe_text = "❌ N/A"

    if aws_polly_available:
        aws_polly_text = f"✅ {len(aws_polly_locales)} locale(s), {aws_total_voices} voice(s)"
    else:
        aws_polly_text = "❌ N/A"

    commercial_rows.append({
        "Service": "AWS (Transcribe + Polly)",
        "ASR": aws_transcribe_text,
        "TTS": aws_polly_text,
    })

    # ElevenLabs Multilingual v2
    all_logs.append("\n[ElevenLabs]")
    elevenlabs_v2_supported = check_elevenlabs_multilingual_v2_support(language_code)
    all_logs.append(f" Multilingual v2: {'✅ Supported' if elevenlabs_v2_supported else '❌ Not supported'}")

    if elevenlabs_v2_supported:
        elevenlabs_v2_tts_text = "✅ Supported"
    else:
        elevenlabs_v2_tts_text = "❌ N/A"

    commercial_rows.append({
        "Service": "ElevenLabs Multilingual v2",
        "ASR": "N/A",
        "TTS": elevenlabs_v2_tts_text,
    })

    # ElevenLabs Turbo v3
    elevenlabs_v3_supported = check_elevenlabs_turbo_v3_support(language_code)
    all_logs.append(f" Turbo v3: {'✅ Supported' if elevenlabs_v3_supported else '❌ Not supported'}")

    if elevenlabs_v3_supported:
        elevenlabs_v3_tts_text = "✅ Supported"
    else:
        elevenlabs_v3_tts_text = "❌ N/A"

    commercial_rows.append({
        "Service": "ElevenLabs Turbo v3",
        "ASR": "N/A",
        "TTS": elevenlabs_v3_tts_text,
    })

    commercial_df = pd.DataFrame(commercial_rows)

    # HuggingFace models
    all_logs.append("\n[HuggingFace Models]")

    asr_models, asr_model_logs = search_huggingface_models(language_code, 'automatic-speech-recognition', max_results=100, max_pages=5)
    all_logs.extend([f" [ASR] {log}" for log in asr_model_logs])

    tts_models, tts_model_logs = search_huggingface_models(language_code, 'text-to-speech', max_results=100, max_pages=5)
    all_logs.extend([f" [TTS] {log}" for log in tts_model_logs])

    # Optionally collapse models that share a base name
    if deduplicate:
        all_logs.append(f"\n[Deduplication]")
        asr_before = len(asr_models)
        asr_models = deduplicate_models(asr_models)
        all_logs.append(f" ASR models: {asr_before} → {len(asr_models)} (removed {asr_before - len(asr_models)} duplicates)")

        tts_before = len(tts_models)
        tts_models = deduplicate_models(tts_models)
        all_logs.append(f" TTS models: {tts_before} → {len(tts_models)} (removed {tts_before - len(tts_models)} duplicates)")
    else:
        # Without deduplication every model counts as a single entry
        for model in asr_models:
            model['duplicates'] = 1
        for model in tts_models:
            model['duplicates'] = 1

    # ASR models table
    asr_models_data = []
    for model in asr_models:
        asr_models_data.append({
            "Model Name": f"[{model['name']}]({model['url']})",
            "Downloads": model['downloads'],
            "Likes": model['likes'],
            "Size": model.get('size', ''),
            "Duplicates": model.get('duplicates', 1)
        })

    if asr_models_data:
        asr_models_df = pd.DataFrame(asr_models_data)
    else:
        asr_models_df = pd.DataFrame(columns=["Model Name", "Downloads", "Likes", "Size", "Duplicates"])

    # TTS models table
    tts_models_data = []
    for model in tts_models:
        tts_models_data.append({
            "Model Name": f"[{model['name']}]({model['url']})",
            "Downloads": model['downloads'],
            "Likes": model['likes'],
            "Size": model.get('size', ''),
            "Duplicates": model.get('duplicates', 1)
        })

    if tts_models_data:
        tts_models_df = pd.DataFrame(tts_models_data)
    else:
        tts_models_df = pd.DataFrame(columns=["Model Name", "Downloads", "Likes", "Size", "Duplicates"])

    # HuggingFace datasets
    all_logs.append("\n[HuggingFace Datasets]")
    asr_datasets, asr_dataset_logs = search_huggingface_datasets(language_code, 'automatic-speech-recognition', max_results=100, max_pages=5)
    all_logs.extend([f" [ASR] {log}" for log in asr_dataset_logs])

    tts_datasets, tts_dataset_logs = search_huggingface_datasets(language_code, 'text-to-speech', max_results=100, max_pages=5)
    all_logs.extend([f" [TTS] {log}" for log in tts_dataset_logs])

    # ASR datasets table
    asr_datasets_data = []
    for dataset in asr_datasets:
        asr_datasets_data.append({
            "Dataset Name": f"[{dataset['name']}]({dataset['url']})",
            "Downloads": dataset['downloads'],
            "Likes": dataset['likes'],
            "Size": dataset.get('size', '')
        })

    if asr_datasets_data:
        asr_datasets_df = pd.DataFrame(asr_datasets_data)
    else:
        asr_datasets_df = pd.DataFrame(columns=["Dataset Name", "Downloads", "Likes", "Size"])

    # TTS datasets table
    tts_datasets_data = []
    for dataset in tts_datasets:
        tts_datasets_data.append({
            "Dataset Name": f"[{dataset['name']}]({dataset['url']})",
            "Downloads": dataset['downloads'],
            "Likes": dataset['likes'],
            "Size": dataset.get('size', '')
        })

    if tts_datasets_data:
        tts_datasets_df = pd.DataFrame(tts_datasets_data)
    else:
        tts_datasets_df = pd.DataFrame(columns=["Dataset Name", "Downloads", "Likes", "Size"])

    log_text = "\n".join(all_logs)

    return commercial_df, asr_models_df, tts_models_df, len(asr_models), len(tts_models), asr_datasets_df, tts_datasets_df, len(asr_datasets), len(tts_datasets), log_text


# Load data at import time so the dropdown is ready when the UI is built
print("Initializing Speech Resource Finder...")
load_app_content()
load_language_list()
load_language_taxonomy()

# Dropdown entries look like "eng: English", sorted by language name
language_choices = [f"{code}: {info['name']}" for code, info in sorted(LANGUAGES.items(), key=lambda x: x[1]['name'])]
print(f"Created dropdown with {len(language_choices)} language options")


with gr.Blocks(title=APP_CONTENT["title"]) as demo:
    gr.Markdown(f"# {APP_CONTENT['title']}")
    gr.Markdown(APP_CONTENT["description"])

    with gr.Row(equal_height=True):
        with gr.Column(scale=2):
            language_dropdown = gr.Dropdown(
                choices=language_choices,
                label="Select Language",
                info="Type to search for a language",
                allow_custom_value=False,
                filterable=True,
            )
        with gr.Column(scale=1):
            language_metadata = gr.HTML(
                """<div style='padding: 15px; border: 2px solid #e0e0e0; border-radius: 4px; background-color: #fafafa; height: 100%; display: flex; align-items: center; justify-content: center; box-sizing: border-box;'>
                <p style='margin: 0; color: #333; font-size: 14px;'>Select a language to see resource classification</p>
                </div>""",
                elem_id="language-metadata"
            )

    gr.Markdown("## Commercial Services")
    commercial_table = gr.Dataframe(
        headers=["Service", "ASR", "TTS"],
        interactive=False,
        wrap=True,
    )

    gr.Markdown("## HuggingFace Models")

    with gr.Row():
        deduplicate_checkbox = gr.Checkbox(
            label="Deduplicate models",
            value=True,
            info="Keep only the model with most downloads for each base name"
        )

    with gr.Tabs():
        with gr.Tab(label="ASR Models") as asr_tab:
            asr_count_label = gr.Markdown("*Loading...*")
            asr_models_table = gr.Dataframe(
                headers=["Model Name", "Downloads", "Likes", "Size", "Duplicates"],
                interactive=False,
                wrap=True,
                datatype=["markdown", "number", "number", "str", "number"],
            )

        with gr.Tab(label="TTS Models") as tts_tab:
            tts_count_label = gr.Markdown("*Loading...*")
            tts_models_table = gr.Dataframe(
                headers=["Model Name", "Downloads", "Likes", "Size", "Duplicates"],
                interactive=False,
                wrap=True,
                datatype=["markdown", "number", "number", "str", "number"],
            )

    gr.Markdown("## HuggingFace Datasets")

    with gr.Tabs():
        with gr.Tab(label="ASR Datasets") as asr_datasets_tab:
            asr_datasets_count_label = gr.Markdown("*Loading...*")
            asr_datasets_table = gr.Dataframe(
                headers=["Dataset Name", "Downloads", "Likes", "Size"],
                interactive=False,
                wrap=True,
                datatype=["markdown", "number", "number", "str"],
            )

        with gr.Tab(label="TTS Datasets") as tts_datasets_tab:
            tts_datasets_count_label = gr.Markdown("*Loading...*")
            tts_datasets_table = gr.Dataframe(
                headers=["Dataset Name", "Downloads", "Likes", "Size"],
                interactive=False,
                wrap=True,
                datatype=["markdown", "number", "number", "str"],
            )

    with gr.Accordion("Logs", open=False):
        log_textbox = gr.Textbox(
            show_label=False,
            lines=15,
            max_lines=30,
            interactive=False,
            placeholder="Logs will appear here...",
            autoscroll=True,
        )

    with gr.Accordion("About this tool", open=False):
        gr.Markdown(APP_CONTENT["full_content"])

    def on_search(language_selection, deduplicate):
        if not language_selection:
            default_html = """<div style='padding: 15px; border: 2px solid #e0e0e0; border-radius: 4px; background-color: #fafafa; height: 100%; display: flex; align-items: center; justify-content: center; box-sizing: border-box;'>
            <p style='margin: 0; color: #333; font-size: 14px;'>Select a language to see resource classification</p>
            </div>"""
            return default_html, None, "", None, "", None, "", None, "", None, ""

        language_code = language_selection.split(":")[0].strip()

        language_name = LANGUAGES.get(language_code, {}).get("name", "")

        level, classification = get_language_taxonomy_info(language_name)

        if level is not None:
            color = get_taxonomy_color(level)
            metadata_html = f"""<div style='padding: 15px; border: 2px solid {color}; border-radius: 4px; background-color: #ffffff; height: 100%; box-sizing: border-box; display: flex; flex-direction: column; justify-content: center;'>
            <h4 style='margin: 0 0 8px 0; color: #333; font-size: 16px;'>{language_name}</h4>
            <div style='margin: 8px 0;'>
            <span style='padding: 6px 12px; background-color: {color}; color: white; border-radius: 4px; font-weight: bold; font-size: 13px;'>{classification}</span>
            </div>
            <p style='margin: 8px 0 0 0; font-size: 11px; color: #555;'>Source: <a href='https://microsoft.github.io/linguisticdiversity/' target='_blank' style='color: #1976d2; text-decoration: none;'>Joshi et al.</a></p>
            </div>"""
        else:
            metadata_html = f"""<div style='padding: 15px; border: 2px solid #757575; border-radius: 4px; background-color: #ffffff; height: 100%; box-sizing: border-box; display: flex; flex-direction: column; justify-content: center;'>
            <h4 style='margin: 0 0 8px 0; color: #333; font-size: 16px;'>{language_name}</h4>
            <div style='margin: 8px 0;'>
            <span style='padding: 6px 12px; background-color: #757575; color: white; border-radius: 4px; font-weight: bold; font-size: 13px;'>Unknown</span>
            </div>
            <p style='margin: 8px 0 0 0; font-size: 11px; color: #555;'>Source: <a href='https://microsoft.github.io/linguisticdiversity/' target='_blank' style='color: #1976d2; text-decoration: none;'>Joshi et al.</a></p>
            </div>"""

        commercial_df, asr_models_df, tts_models_df, asr_models_count, tts_models_count, asr_datasets_df, tts_datasets_df, asr_datasets_count, tts_datasets_count, logs = search_language_resources(language_code, deduplicate=deduplicate)

        asr_models_label = f"**Found {asr_models_count} ASR model(s)**"
        tts_models_label = f"**Found {tts_models_count} TTS model(s)**"
        asr_datasets_label = f"**Found {asr_datasets_count} ASR dataset(s)**"
        tts_datasets_label = f"**Found {tts_datasets_count} TTS dataset(s)**"

        return metadata_html, commercial_df, asr_models_label, asr_models_df, tts_models_label, tts_models_df, asr_datasets_label, asr_datasets_df, tts_datasets_label, tts_datasets_df, logs

    language_dropdown.change(
        fn=on_search,
        inputs=[language_dropdown, deduplicate_checkbox],
        outputs=[language_metadata, commercial_table, asr_count_label, asr_models_table, tts_count_label, tts_models_table, asr_datasets_count_label, asr_datasets_table, tts_datasets_count_label, tts_datasets_table, log_textbox],
    )

    deduplicate_checkbox.change(
        fn=on_search,
        inputs=[language_dropdown, deduplicate_checkbox],
        outputs=[language_metadata, commercial_table, asr_count_label, asr_models_table, tts_count_label, tts_models_table, asr_datasets_count_label, asr_datasets_table, tts_datasets_count_label, tts_datasets_table, log_textbox],
    )


if __name__ == "__main__":
    demo.launch(server_name="0.0.0.0", server_port=7860, share=False, show_error=True)