import gradio as gr
import pandas as pd
import requests
from bs4 import BeautifulSoup
from functools import lru_cache
import csv
from io import StringIO
import re

# Configuration
LANGUAGE_CODES_FILE = "language-codes-full.csv"
APP_CONTENT_FILE = "app_content.md"
LANGUAGE_TAXONOMY_URL = "https://microsoft.github.io/linguisticdiversity/assets/lang2tax.txt"

# Language list will be loaded from CSV
# Structure: {primary_code: {"name": str, "alpha3_b": str, "alpha3_t": str, "alpha2": str}}
LANGUAGES = {}

# Language taxonomy mapping (from Joshi et al.'s linguistic diversity paper)
# Structure: {language_name_lowercase: level}
LANGUAGE_TAXONOMY = {}

# Taxonomy level descriptions
TAXONOMY_LEVELS = {
    0: "The Left-Behinds",
    1: "The Scraping-Bys",
    2: "The Hopefuls",
    3: "The Rising Stars",
    4: "The Underdogs",
    5: "The Winners",
}

# App content will be loaded from markdown file
APP_CONTENT = {
    "title": "Speech Resource Finder",
    "description": "Search for speech resources",
    "full_content": "",
}


def load_app_content(content_path=None):
    """Load app content from markdown file"""
    global APP_CONTENT
    if content_path is None:
        content_path = APP_CONTENT_FILE
    try:
        with open(content_path, 'r', encoding='utf-8') as f:
            content = f.read()

        # Parse markdown content
        lines = content.split('\n')

        # Extract title (first # heading)
        title = "Speech Resource Finder"
        for line in lines:
            if line.startswith('# '):
                title = line[2:].strip()
                break

        # Extract description (text after ## Description until next ##)
        description = ""
        in_description = False
        for line in lines:
            if line.startswith('## Description'):
                in_description = True
                continue
            elif in_description and line.startswith('##'):
                break
            elif in_description and line.strip():
                description += line.strip() + " "

        APP_CONTENT = {
            "title": title,
            "description": description.strip(),
            "full_content": content,
        }
        print(f"Loaded app content from {content_path}")
    except Exception as e:
        print(f"Error loading app content: {e}")
        print("Using default content")


def load_language_list(csv_path=None):
    """Load ISO 639 language codes from CSV file"""
    global LANGUAGES
    if csv_path is None:
        csv_path = LANGUAGE_CODES_FILE
    try:
        with open(csv_path, 'r', encoding='utf-8') as f:
            reader = csv.DictReader(f)
            for row in reader:
                # Use alpha3-b as primary key, fall back to alpha3-t if empty
                code_b = row['alpha3-b'].strip()
                code_t = row['alpha3-t'].strip()
                code_2 = row['alpha2'].strip()
                name = row['English'].strip()
                primary_code = code_b if code_b else code_t
                if primary_code and name:
                    LANGUAGES[primary_code] = {
                        "name": name,
                        "alpha3_b": code_b,
                        "alpha3_t": code_t,
                        "alpha2": code_2,
                    }
        print(f"Loaded {len(LANGUAGES)} languages from {csv_path}")
    except Exception as e:
        print(f"Error loading language list: {e}")
        # Fall back to a minimal set
        LANGUAGES = {
            "eng": {"name": "English", "alpha3_b": "eng", "alpha3_t": "", "alpha2": "en"},
            "spa": {"name": "Spanish", "alpha3_b": "spa", "alpha3_t": "", "alpha2": "es"},
            "fra": {"name": "French", "alpha3_b": "fra", "alpha3_t": "", "alpha2": "fr"},
            "deu": {"name": "German", "alpha3_b": "ger", "alpha3_t": "deu", "alpha2": "de"},
        }
        print(f"Using fallback with {len(LANGUAGES)} languages")


def load_language_taxonomy():
    """Load language taxonomy data from Microsoft's linguistic diversity project"""
    global LANGUAGE_TAXONOMY
    try:
        response = requests.get(LANGUAGE_TAXONOMY_URL, timeout=10)
        response.raise_for_status()
        # Parse the CSV-like content (format: language_name,level)
        for line in response.text.strip().split('\n'):
            if line.strip():
                parts = line.strip().split(',')
                if len(parts) == 2:
                    lang_name = parts[0].strip().lower()
                    level = int(parts[1].strip())
                    LANGUAGE_TAXONOMY[lang_name] = level
        print(f"Loaded taxonomy data for {len(LANGUAGE_TAXONOMY)} languages")
    except Exception as e:
        print(f"Warning: Could not load language taxonomy: {e}")
        print("Language classification will show as 'Unknown'")


def get_taxonomy_color(level):
    """Get color code for taxonomy level (red for left-behind, green for winners)"""
    colors = {
        0: "#d32f2f",  # Red - The Left-Behinds
        1: "#f57c00",  # Orange - The Scraping-Bys
        2: "#fbc02d",  # Yellow - The Hopefuls
        3: "#afb42b",  # Yellow-green - The Rising Stars
        4: "#7cb342",  # Light green - The Underdogs
        5: "#388e3c",  # Green - The Winners
    }
    return colors.get(level, "#757575")  # Gray for unknown


def get_language_taxonomy_info(language_name):
    """
    Get taxonomy classification for a language.
    Returns a tuple of (level, description) or (None, "Unknown")
    """
    if not language_name:
        return None, "Unknown"

    # Try exact match (case-insensitive)
    lang_lower = language_name.lower()
    if lang_lower in LANGUAGE_TAXONOMY:
        level = LANGUAGE_TAXONOMY[lang_lower]
        return level, TAXONOMY_LEVELS.get(level, f"Level {level}")

    # Try with semicolon-separated alternative names (e.g., "Catalan; Valencian")
    if ';' in lang_lower:
        parts = [p.strip() for p in lang_lower.split(';')]
        for part in parts:
            if part in LANGUAGE_TAXONOMY:
                level = LANGUAGE_TAXONOMY[part]
                return level, TAXONOMY_LEVELS.get(level, f"Level {level}")

    # Try with comma-separated variations (e.g., "Chinese, Mandarin")
    if ',' in lang_lower:
        parts = [p.strip() for p in lang_lower.split(',')]
        for part in parts:
            if part in LANGUAGE_TAXONOMY:
                level = LANGUAGE_TAXONOMY[part]
                return level, TAXONOMY_LEVELS.get(level, f"Level {level}")

    return None, "Unknown"


@lru_cache(maxsize=1)
def fetch_azure_asr_languages():
    """Scrape Azure Speech-to-Text supported languages"""
    url = "https://learn.microsoft.com/en-us/azure/ai-services/speech-service/language-support?tabs=stt"
    try:
        response = requests.get(url, timeout=10)
        response.raise_for_status()
        soup = BeautifulSoup(response.content, 'html.parser')

        # Find the table with locale data
        # The table has columns: Locale (BCP-47) | Language | Fast transcription support | Custom speech support
        tables = soup.find_all('table')
        azure_asr = {}
        for table in tables:
            rows = table.find_all('tr')
            if not rows:
                continue
            # Check if this is the right table by looking at headers
            headers = [th.get_text(strip=True) for th in rows[0].find_all('th')]
            if 'Locale' in ' '.join(headers) or 'Language' in ' '.join(headers):
                for row in rows[1:]:  # Skip header
                    cols = row.find_all('td')
                    if len(cols) >= 2:
                        locale = cols[0].get_text(strip=True)
                        language = cols[1].get_text(strip=True)
                        if locale and language:
                            azure_asr[locale] = language
                break
        return azure_asr
    except Exception as e:
        print(f"Error fetching Azure ASR data: {e}")
        return {}
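
# Illustrative shapes of the Azure scraper results (actual locales, names and voice
# counts depend on the live docs pages at fetch time; the TTS scraper is defined below):
#   fetch_azure_asr_languages() -> {"en-US": "English (United States)", ...}
#   fetch_azure_tts_languages() -> {"en-US": {"language": "English (United States)", "voice_count": 20}, ...}
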
@lru_cache(maxsize=1)
def fetch_azure_tts_languages():
    """Scrape Azure Text-to-Speech supported languages with voice counts"""
    url = "https://learn.microsoft.com/en-us/azure/ai-services/speech-service/language-support?tabs=tts"
    try:
        response = requests.get(url, timeout=10)
        response.raise_for_status()
        soup = BeautifulSoup(response.content, 'html.parser')

        # Find the TTS table
        # Columns: Locale (BCP-47) | Language | Text to speech voices
        tables = soup.find_all('table')
        azure_tts = {}
        for table in tables:
            rows = table.find_all('tr')
            if not rows:
                continue
            headers = [th.get_text(strip=True) for th in rows[0].find_all('th')]
            if 'Text to speech' in ' '.join(headers) or 'voices' in ' '.join(headers).lower():
                for row in rows[1:]:
                    cols = row.find_all('td')
                    if len(cols) >= 3:
                        locale = cols[0].get_text(strip=True)
                        language = cols[1].get_text(strip=True)
                        voices_text = cols[2].get_text(strip=True)
                        # Count number of voices (look for "Neural" in the text)
                        voice_count = voices_text.count('Neural')
                        if locale and language:
                            azure_tts[locale] = {
                                'language': language,
                                'voice_count': voice_count,
                            }
                break
        return azure_tts
    except Exception as e:
        print(f"Error fetching Azure TTS data: {e}")
        return {}


@lru_cache(maxsize=1)
def fetch_google_stt_languages():
    """Scrape Google Cloud Speech-to-Text supported languages"""
    url = "https://cloud.google.com/speech-to-text/docs/speech-to-text-supported-languages"
    try:
        response = requests.get(url, timeout=10)
        response.raise_for_status()
        soup = BeautifulSoup(response.content, 'html.parser')

        # Find tables with BCP-47 language codes
        tables = soup.find_all('table')
        google_stt = {}
        for table in tables:
            rows = table.find_all('tr')
            if not rows:
                continue
            # Check if this table has a BCP-47 column
            headers = [th.get_text(strip=True) for th in rows[0].find_all('th')]
            # Find BCP-47 column index
            bcp47_idx = None
            name_idx = None
            for idx, header in enumerate(headers):
                if 'BCP-47' in header or 'BCP47' in header:
                    bcp47_idx = idx
                if 'Name' in header and name_idx is None:
                    name_idx = idx
            if bcp47_idx is not None:
                for row in rows[1:]:  # Skip header
                    cols = row.find_all('td')
                    if len(cols) > bcp47_idx:
                        locale = cols[bcp47_idx].get_text(strip=True)
                        language = (
                            cols[name_idx].get_text(strip=True)
                            if name_idx is not None and len(cols) > name_idx
                            else ''
                        )
                        if locale and locale not in ['—', '-', '']:
                            google_stt[locale] = language
        return google_stt
    except Exception as e:
        print(f"Error fetching Google STT data: {e}")
        return {}


@lru_cache(maxsize=1)
def fetch_google_tts_languages():
    """Scrape Google Cloud Text-to-Speech supported languages with voice counts"""
    url = "https://cloud.google.com/text-to-speech/docs/voices"
    try:
        response = requests.get(url, timeout=10)
        response.raise_for_status()
        soup = BeautifulSoup(response.content, 'html.parser')

        # Find the voices table
        # Columns: Language | Voice type | Language code | Voice name | SSML Gender | Sample
        tables = soup.find_all('table')
        google_tts = {}
        for table in tables:
            rows = table.find_all('tr')
            if not rows:
                continue
            headers = [th.get_text(strip=True) for th in rows[0].find_all('th')]
            # Find Language code column index
            lang_code_idx = None
            for idx, header in enumerate(headers):
                if 'Language code' in header or 'language code' in header.lower():
                    lang_code_idx = idx
                    break
            if lang_code_idx is not None:
                for row in rows[1:]:
                    cols = row.find_all('td')
                    if len(cols) > lang_code_idx:
                        locale = cols[lang_code_idx].get_text(strip=True)
                        if locale and locale not in ['—', '-', '']:
                            # Count voices per locale
                            if locale in google_tts:
                                google_tts[locale]['voice_count'] += 1
                            else:
                                language = cols[0].get_text(strip=True) if len(cols) > 0 else ''
                                google_tts[locale] = {
                                    'language': language,
                                    'voice_count': 1,
                                }
        return google_tts
    except Exception as e:
        print(f"Error fetching Google TTS data: {e}")
        return {}


@lru_cache(maxsize=1)
def fetch_elevenlabs_multilingual_v2():
    """Get ElevenLabs Multilingual v2 supported languages"""
    # Based on https://elevenlabs.io/docs/models#multilingual-v2
    # These are ISO 639-1 (2-letter) codes
    supported_codes = {
        'en', 'ja', 'zh', 'de', 'hi', 'fr', 'ko', 'pt', 'it', 'es', 'id', 'nl',
        'tr', 'fil', 'pl', 'sv', 'bg', 'ro', 'ar', 'cs', 'el', 'fi', 'hr', 'ms',
        'sk', 'da', 'ta', 'uk', 'ru',
    }
    return supported_codes
@lru_cache(maxsize=1)
def fetch_elevenlabs_turbo_v3():
    """Get ElevenLabs Eleven Turbo v3 (formerly v3 Alpha) supported languages"""
    # Based on https://elevenlabs.io/docs/models#eleven-v3-alpha
    # These are ISO 639-3 (3-letter) codes
    supported_codes = {
        'afr', 'ara', 'hye', 'asm', 'aze', 'bel', 'ben', 'bos', 'bul', 'cat',
        'ceb', 'nya', 'hrv', 'ces', 'dan', 'nld', 'eng', 'est', 'fil', 'fin',
        'fra', 'glg', 'kat', 'deu', 'ell', 'guj', 'hau', 'heb', 'hin', 'hun',
        'isl', 'ind', 'gle', 'ita', 'jpn', 'jav', 'kan', 'kaz', 'kir', 'kor',
        'lav', 'lin', 'lit', 'ltz', 'mkd', 'msa', 'mal', 'cmn', 'mar', 'nep',
        'nor', 'pus', 'fas', 'pol', 'por', 'pan', 'ron', 'rus', 'srp', 'snd',
        'slk', 'slv', 'som', 'spa', 'swa', 'swe', 'tam', 'tel', 'tha', 'tur',
        'ukr', 'urd', 'vie', 'cym',
    }
    return supported_codes


@lru_cache(maxsize=1)
def fetch_aws_transcribe_languages():
    """Scrape AWS Transcribe (ASR) supported languages"""
    url = "https://docs.aws.amazon.com/transcribe/latest/dg/supported-languages.html"
    try:
        response = requests.get(url, timeout=10)
        response.raise_for_status()
        soup = BeautifulSoup(response.content, 'html.parser')

        # Find tables with language codes
        tables = soup.find_all('table')
        aws_transcribe = {}
        for table in tables:
            rows = table.find_all('tr')
            if not rows:
                continue
            # Check if this table has a language code column
            headers = [th.get_text(strip=True) for th in rows[0].find_all('th')]
            # Find language code column index
            lang_code_idx = None
            lang_name_idx = None
            for idx, header in enumerate(headers):
                if 'Language code' in header or 'language code' in header.lower():
                    lang_code_idx = idx
                if 'Language' == header or header.startswith('Language'):
                    lang_name_idx = idx
            if lang_code_idx is not None:
                for row in rows[1:]:  # Skip header
                    cols = row.find_all('td')
                    if len(cols) > lang_code_idx:
                        locale = cols[lang_code_idx].get_text(strip=True)
                        language = (
                            cols[lang_name_idx].get_text(strip=True)
                            if lang_name_idx is not None and len(cols) > lang_name_idx
                            else ''
                        )
                        if locale and locale not in ['—', '-', '']:
                            aws_transcribe[locale] = language
        return aws_transcribe
    except Exception as e:
        print(f"Error fetching AWS Transcribe data: {e}")
        return {}


@lru_cache(maxsize=1)
def fetch_aws_polly_languages():
    """Scrape AWS Polly (TTS) supported languages"""
    url = "https://docs.aws.amazon.com/polly/latest/dg/supported-languages.html"
    try:
        response = requests.get(url, timeout=10)
        response.raise_for_status()
        soup = BeautifulSoup(response.content, 'html.parser')

        # Find tables with language codes
        tables = soup.find_all('table')
        aws_polly = {}
        for table in tables:
            rows = table.find_all('tr')
            if not rows:
                continue
            # Check if this table has a language code column
            headers = [th.get_text(strip=True) for th in rows[0].find_all('th')]
            # Find language code column index
            lang_code_idx = None
            lang_name_idx = None
            for idx, header in enumerate(headers):
                if 'Language code' in header or 'language code' in header.lower():
                    lang_code_idx = idx
                if 'Language' == header or header.startswith('Language'):
                    lang_name_idx = idx
            if lang_code_idx is not None:
                for row in rows[1:]:  # Skip header
                    cols = row.find_all('td')
                    if len(cols) > lang_code_idx:
                        locale = cols[lang_code_idx].get_text(strip=True)
                        language = (
                            cols[lang_name_idx].get_text(strip=True)
                            if lang_name_idx is not None and len(cols) > lang_name_idx
                            else ''
                        )
                        if locale and locale not in ['—', '-', '']:
                            # Count voices per locale (each row is a different voice/locale combo)
                            if locale in aws_polly:
                                aws_polly[locale]['voice_count'] += 1
                            else:
                                aws_polly[locale] = {
                                    'language': language,
                                    'voice_count': 1,
                                }
        return aws_polly
    except Exception as e:
        print(f"Error fetching AWS Polly data: {e}")
        return {}
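
# Illustrative shapes of the AWS scraper results above (actual locales, names and
# voice counts depend on the live AWS docs pages at fetch time):
#   fetch_aws_transcribe_languages() -> {"en-US": "English, US", ...}
#   fetch_aws_polly_languages()      -> {"en-US": {"language": "English, US", "voice_count": 3}, ...}
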
def get_azure_locales_for_language(language_code):
    """
    Get Azure BCP-47 locales for a language using its alpha2 code
    Returns list of matching locales from Azure
    """
    lang_info = LANGUAGES.get(language_code)
    if not lang_info or not lang_info['alpha2']:
        return []

    alpha2 = lang_info['alpha2']
    azure_asr = fetch_azure_asr_languages()
    azure_tts = fetch_azure_tts_languages()

    # Find all locales that start with the alpha2 code
    matching_locales = set()
    for locale in azure_asr.keys():
        if locale.startswith(alpha2 + '-') or locale == alpha2:
            matching_locales.add(locale)
    for locale in azure_tts.keys():
        if locale.startswith(alpha2 + '-') or locale == alpha2:
            matching_locales.add(locale)
    return sorted(matching_locales)


def get_google_locales_for_language(language_code):
    """
    Get Google Cloud BCP-47 locales for a language using its alpha2 code
    Returns list of matching locales from Google Cloud
    """
    lang_info = LANGUAGES.get(language_code)
    if not lang_info or not lang_info['alpha2']:
        return []

    alpha2 = lang_info['alpha2']
    google_stt = fetch_google_stt_languages()
    google_tts = fetch_google_tts_languages()

    # Find all locales that start with the alpha2 code
    matching_locales = set()
    for locale in google_stt.keys():
        if locale.startswith(alpha2 + '-') or locale == alpha2:
            matching_locales.add(locale)
    for locale in google_tts.keys():
        if locale.startswith(alpha2 + '-') or locale == alpha2:
            matching_locales.add(locale)
    return sorted(matching_locales)


def check_elevenlabs_multilingual_v2_support(language_code):
    """
    Check if ElevenLabs Multilingual v2 supports a language using ISO 639-1 (alpha2) codes
    Returns True if supported, False otherwise
    """
    lang_info = LANGUAGES.get(language_code)
    if not lang_info:
        return False

    supported_codes = fetch_elevenlabs_multilingual_v2()
    # Check alpha2 code (2-letter code)
    if lang_info['alpha2'] and lang_info['alpha2'] in supported_codes:
        return True
    return False


def check_elevenlabs_turbo_v3_support(language_code):
    """
    Check if ElevenLabs Turbo v3 supports a language using ISO 639-3 (alpha3) codes
    Returns True if supported, False otherwise
    """
    lang_info = LANGUAGES.get(language_code)
    if not lang_info:
        return False

    supported_codes = fetch_elevenlabs_turbo_v3()
    # Check alpha3_b code first (3-letter code, bibliographic)
    if lang_info['alpha3_b'] and lang_info['alpha3_b'] in supported_codes:
        return True
    # Check alpha3_t code (3-letter code, terminological)
    if lang_info['alpha3_t'] and lang_info['alpha3_t'] in supported_codes:
        return True
    return False


def get_aws_locales_for_language(language_code):
    """
    Get AWS locales for a language using its alpha2 code
    Returns list of matching locales from AWS Transcribe and Polly
    """
    lang_info = LANGUAGES.get(language_code)
    if not lang_info or not lang_info['alpha2']:
        return []

    alpha2 = lang_info['alpha2']
    aws_transcribe = fetch_aws_transcribe_languages()
    aws_polly = fetch_aws_polly_languages()

    # Find all locales that start with the alpha2 code
    matching_locales = set()
    for locale in aws_transcribe.keys():
        if locale.startswith(alpha2 + '-') or locale == alpha2:
            matching_locales.add(locale)
    for locale in aws_polly.keys():
        if locale.startswith(alpha2 + '-') or locale == alpha2:
            matching_locales.add(locale)
    return sorted(matching_locales)
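
# Illustrative example of the alpha2 prefix matching used by the helpers above:
# for "fra" (alpha2 "fr"), locales such as "fr-FR" and "fr-CA" match because they
# start with "fr-" (or equal "fr"), so a call might return e.g.
#   get_azure_locales_for_language("fra") -> ["fr-BE", "fr-CA", "fr-CH", "fr-FR"]
# depending on what the scrapers found.
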
def search_huggingface_models(language_code, pipeline_tag, max_results=100, max_pages=3):
    """
    Search HuggingFace for models supporting a specific language
    pipeline_tag: 'automatic-speech-recognition' or 'text-to-speech'
    max_results: maximum number of models to return
    max_pages: maximum number of pages to search per language code
    Returns tuple: (list of model dictionaries, log messages)
    """
    lang_info = LANGUAGES.get(language_code)
    logs = []
    if not lang_info:
        logs.append(f"No language info found for code: {language_code}")
        return [], logs

    # Try multiple language code formats
    codes_to_try = []
    if lang_info['alpha2']:
        codes_to_try.append(lang_info['alpha2'])  # 2-letter code
    if lang_info['alpha3_b']:
        codes_to_try.append(lang_info['alpha3_b'])  # 3-letter code
    if lang_info['alpha3_t']:
        codes_to_try.append(lang_info['alpha3_t'])  # 3-letter terminological
    logs.append(f"Language codes to search: {set(codes_to_try)}")

    models = []
    seen_models = set()
    for code in codes_to_try:
        if len(models) >= max_results:
            break
        logs.append(f"Searching for language code: {code}")
        # Try multiple pages for this language code
        for page in range(max_pages):
            if len(models) >= max_results:
                break
            try:
                # Use HuggingFace model search with pagination
                url = f"https://huggingface.co/models?pipeline_tag={pipeline_tag}&language={code}&sort=trending"
                if page > 0:
                    url += f"&p={page}"
                logs.append(f" Page {page}: {url}")
                headers = {
                    'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36'
                }
                response = requests.get(url, headers=headers, timeout=10)
                response.raise_for_status()
                soup = BeautifulSoup(response.content, 'html.parser')

                # Parse model cards from the page
                model_cards = soup.find_all('article', class_='overview-card-wrapper')
                if not model_cards:
                    logs.append(f" No model cards found on page {page}")
                    break
                logs.append(f" Found {len(model_cards)} model cards on page {page}")

                for card in model_cards:
                    if len(models) >= max_results:
                        break
                    try:
                        link = card.find('a', href=True)
                        if link:
                            href = link.get('href', '')
                            model_name = href.lstrip('/')
                            if model_name and model_name != '#' and model_name not in seen_models:
                                seen_models.add(model_name)

                                # Parse stats directly from the card HTML by looking at SVG icons
                                downloads = 0
                                likes = 0
                                size = ""

                                # Find all SVG elements in the card
                                svgs = card.find_all('svg')
                                for svg in svgs:
                                    # Get the next sibling text after the SVG
                                    # Could be direct text or text within a span/other element
                                    next_elem = svg.find_next_sibling(string=True)
                                    stat_text = ""
                                    if next_elem and next_elem.strip():
                                        stat_text = next_elem.strip()
                                    else:
                                        # Try to find text in the next sibling element (e.g., a <span>)
                                        next_tag = svg.find_next_sibling()
                                        if next_tag:
                                            stat_text = next_tag.get_text(strip=True)
                                    if not stat_text or len(stat_text) < 1:
                                        continue

                                    # Identify icon type by viewBox or path content
                                    svg_str = str(svg)
                                    # Download icon: viewBox="0 0 32 32" with download arrow path
                                    if 'M26 24v4H6v-4H4v4a2 2 0 0 0 2 2h20a2 2 0 0 0 2-2v-4zm0-10l-1.41-1.41L17 20.17V2h-2v18.17l-7.59-7.58L6 14l10 10l10-10z' in svg_str:
                                        downloads = parse_stat_number(stat_text)
                                    # Like/heart icon: heart path
                                    elif 'M22.45,6a5.47,5.47,0,0,1,3.91,1.64,5.7,5.7,0,0,1,0,8L16,26.13' in svg_str:
                                        likes = parse_stat_number(stat_text)
                                    # Model size icon: small grid icon (viewBox="0 0 12 12") with specific path for parameter count
                                    elif 'M10 10H8.4V8.4H10V10Zm0-3.2H8.4V5.2H10v1.6ZM6.8 10H5.2V8.4h1.6V10Z' in svg_str:
                                        # Model parameter count (e.g., "2B", "0.6B")
                                        # Must be short and contain B for billion params
                                        if len(stat_text) <= 6 and re.search(r'\d+\.?\d*\s*[Bb]', stat_text):
                                            size = stat_text

                                models.append({
                                    'name': model_name,
                                    'url': f"https://huggingface.co/{model_name}",
                                    'downloads': downloads,
                                    'likes': likes,
                                    'size': size,
                                })
                    except Exception as e:
                        logs.append(f" Error parsing model card: {e}")
                        continue
            except Exception as e:
                logs.append(f" ERROR searching page {page}: {e}")
                break

    # Sort by downloads (descending)
    models.sort(key=lambda x: x['downloads'], reverse=True)
    logs.append(f"Total unique models found: {len(models)}")
    return models, logs
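
# Each entry returned by search_huggingface_models() has this shape (stats illustrative,
# scraped from the model card at search time):
#   {"name": "openai/whisper-large-v3", "url": "https://huggingface.co/openai/whisper-large-v3",
#    "downloads": 4070000, "likes": 2500, "size": ""}
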
def get_huggingface_stats(item_name, item_type='datasets'):
    """
    Get likes and downloads for a HuggingFace dataset or model using the API
    item_type: 'datasets' or 'models'
    Returns dict with likes and downloads
    NOTE: This method is currently NOT USED. We parse stats directly from HTML instead.
    Keeping it here as a fallback in case HTML parsing fails.
    """
    try:
        api_url = f"https://huggingface.co/api/{item_type}/{item_name}"
        response = requests.get(api_url, timeout=5)
        if response.status_code == 200:
            data = response.json()
            return {
                'likes': data.get('likes', 0),
                'downloads': data.get('downloads', 0),
            }
    except Exception:
        pass
    return {'likes': 0, 'downloads': 0}


def parse_stat_number(stat_text):
    """
    Parse HuggingFace stat numbers like '4.07M', '23.4k', '349' into integers
    Returns integer value or 0 if parsing fails
    """
    if not stat_text:
        return 0
    stat_text = stat_text.strip().upper()
    try:
        # Handle 'M' (millions)
        if 'M' in stat_text:
            return int(float(stat_text.replace('M', '')) * 1_000_000)
        # Handle 'K' (thousands)
        elif 'K' in stat_text:
            return int(float(stat_text.replace('K', '')) * 1_000)
        # Plain number
        else:
            return int(stat_text.replace(',', ''))
    except (ValueError, AttributeError):
        return 0


def deduplicate_models(models):
    """
    Deduplicate models by base name (without user/org prefix)
    Keep the model with most downloads and count duplicates
    Returns list of deduplicated models with duplicate count added
    """
    from collections import defaultdict

    # Group models by base name
    grouped = defaultdict(list)
    for model in models:
        # Extract base name (everything after the last '/')
        name_parts = model['name'].split('/')
        if len(name_parts) > 1:
            base_name = name_parts[-1]  # e.g., "whisper-large-v3"
        else:
            base_name = model['name']
        grouped[base_name].append(model)

    # For each group, keep the one with most downloads
    deduplicated = []
    for base_name, model_list in grouped.items():
        # Sort by downloads (descending) and keep the first one
        model_list.sort(key=lambda x: x['downloads'], reverse=True)
        best_model = model_list[0]
        # Record how many other copies of this base name were dropped
        best_model['duplicates'] = len(model_list) - 1
        deduplicated.append(best_model)

    # Sort by downloads again
    deduplicated.sort(key=lambda x: x['downloads'], reverse=True)
    return deduplicated
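
# Illustrative behaviour of the helpers above:
#   parse_stat_number("4.07M") -> 4070000, parse_stat_number("23.4k") -> 23400,
#   parse_stat_number("n/a") -> 0
#   deduplicate_models() keeps one entry per base name, e.g. "org-a/whisper-small" and
#   "org-b/whisper-small" collapse to whichever has more downloads, with
#   entry["duplicates"] recording how many other copies were dropped.
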
def search_huggingface_datasets(language_code, task_category, max_results=100, max_pages=3):
    """
    Search HuggingFace for datasets supporting a specific language
    task_category: 'automatic-speech-recognition' or 'text-to-speech'
    max_results: maximum number of datasets to return
    max_pages: maximum number of pages to search per language code
    Returns tuple: (list of dataset dictionaries, log messages)
    """
    lang_info = LANGUAGES.get(language_code)
    logs = []
    if not lang_info:
        logs.append(f"No language info found for code: {language_code}")
        return [], logs

    # Collect all unique language codes for this language
    language_codes = set()
    if lang_info['alpha2']:
        language_codes.add(lang_info['alpha2'])  # 2-letter code
    if lang_info['alpha3_b']:
        language_codes.add(lang_info['alpha3_b'])  # 3-letter code
    if lang_info['alpha3_t']:
        language_codes.add(lang_info['alpha3_t'])  # 3-letter terminological
    logs.append(f"Language codes to search: {language_codes}")

    datasets = []
    seen_datasets = set()
    # Search separately for each language code
    for code in language_codes:
        if len(datasets) >= max_results:
            break
        logs.append(f"Searching for language code: {code}")
        for page in range(max_pages):
            if len(datasets) >= max_results:
                break
            try:
                # Use HuggingFace dataset search with the correct format
                # Format: task_categories=task_categories:automatic-speech-recognition&language=language:en
                url = f"https://huggingface.co/datasets?task_categories=task_categories:{task_category}&language=language:{code}&sort=trending"
                if page > 0:
                    url += f"&p={page}"
                logs.append(f" Page {page}: {url}")
                headers = {
                    'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36'
                }
                response = requests.get(url, headers=headers, timeout=10)
                response.raise_for_status()
                soup = BeautifulSoup(response.content, 'html.parser')

                # Parse dataset cards from the page
                dataset_cards = soup.find_all('article', class_='overview-card-wrapper')
                if not dataset_cards:
                    logs.append(f" No dataset cards found on page {page}")
                    break
                logs.append(f" Found {len(dataset_cards)} dataset cards on page {page}")

                for card in dataset_cards:
                    if len(datasets) >= max_results:
                        break
                    try:
                        link = card.find('a', href=True)
                        if link:
                            href = link.get('href', '')
                            dataset_path = href.lstrip('/')
                            # Remove "datasets/" prefix if present
                            if dataset_path.startswith('datasets/'):
                                dataset_name = dataset_path[9:]  # Remove "datasets/" (9 chars)
                            else:
                                dataset_name = dataset_path
                            if dataset_name and dataset_name != '#' and dataset_name not in seen_datasets:
                                seen_datasets.add(dataset_name)

                                # Parse stats directly from the card HTML by looking at SVG icons
                                downloads = 0
                                likes = 0
                                size = ""

                                # Find all SVG elements in the card
                                svgs = card.find_all('svg')
                                for svg in svgs:
                                    # Get the next sibling text after the SVG
                                    # Could be direct text or text within a span/other element
                                    next_elem = svg.find_next_sibling(string=True)
                                    stat_text = ""
                                    if next_elem and next_elem.strip():
                                        stat_text = next_elem.strip()
                                    else:
                                        # Try to find text in the next sibling element (e.g., a <span>)
                                        next_tag = svg.find_next_sibling()
                                        if next_tag:
                                            stat_text = next_tag.get_text(strip=True)
                                    # Skip non-numeric text like "Viewer", "Updated", etc.
                                    if not stat_text or len(stat_text) < 1 or stat_text in ['Viewer', 'Updated']:
                                        continue

                                    # Identify icon type by viewBox or path content
                                    svg_str = str(svg)
                                    # Download icon: viewBox="0 0 32 32" with download arrow path
                                    if 'M26 24v4H6v-4H4v4a2 2 0 0 0 2 2h20a2 2 0 0 0 2-2v-4zm0-10l-1.41-1.41L17 20.17V2h-2v18.17l-7.59-7.58L6 14l10 10l10-10z' in svg_str:
                                        downloads = parse_stat_number(stat_text)
                                    # Like/heart icon: heart path
                                    elif 'M22.45,6a5.47,5.47,0,0,1,3.91,1.64,5.7,5.7,0,0,1,0,8L16,26.13' in svg_str:
                                        likes = parse_stat_number(stat_text)
                                    # Dataset size icon: table/grid icon with fill-rule="evenodd"
                                    elif 'fill-rule="evenodd"' in svg_str and 'clip-rule="evenodd"' in svg_str:
                                        # Dataset size (e.g., "411k", "23.4M", "65.1k")
                                        # Must look like a number (has k, M, or digits)
                                        if any(c in stat_text for c in ['k', 'K', 'm', 'M']) or stat_text.replace(',', '').replace('.', '').isdigit():
                                            size = stat_text

                                datasets.append({
                                    'name': dataset_name,
                                    'url': f"https://huggingface.co/datasets/{dataset_name}",
                                    'downloads': downloads,
                                    'likes': likes,
                                    'size': size,
                                })
                    except Exception as e:
                        logs.append(f" Error parsing dataset card: {e}")
                        continue
            except Exception as e:
                logs.append(f" ERROR searching page {page}: {e}")
                break

    # Sort by downloads (descending)
    datasets.sort(key=lambda x: x['downloads'], reverse=True)
    logs.append(f"Total unique datasets found: {len(datasets)}")
    return datasets, logs
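
# Each entry returned by search_huggingface_datasets() has this shape (stats illustrative,
# scraped from the dataset card at search time):
#   {"name": "mozilla-foundation/common_voice_17_0",
#    "url": "https://huggingface.co/datasets/mozilla-foundation/common_voice_17_0",
#    "downloads": 120000, "likes": 300, "size": "411k"}
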
def search_language_resources(language_code, deduplicate=False):
    """
    Search for ASR/TTS resources for a given language
    Returns results organized by service type
    deduplicate: if True, remove duplicate models (same base name) and keep only the one with most downloads
    """
    all_logs = []
    if not language_code:
        return None, None, None, 0, 0, None, None, 0, 0, ""

    lang_info = LANGUAGES.get(language_code)
    if not lang_info:
        return None, None, None, 0, 0, None, None, 0, 0, ""

    language_name = lang_info['name']
    all_logs.append(f"=== Searching for {language_name} ({language_code}) ===")
    all_logs.append(f"Language codes: alpha2={lang_info['alpha2']}, alpha3_b={lang_info['alpha3_b']}, alpha3_t={lang_info['alpha3_t']}")

    # Fetch Azure data
    all_logs.append("\n[Azure Speech Services]")
    azure_asr = fetch_azure_asr_languages()
    azure_tts = fetch_azure_tts_languages()
    all_logs.append(f" Fetched {len(azure_asr)} ASR languages and {len(azure_tts)} TTS languages from Azure")

    # Get matching Azure locales using alpha2 code
    azure_locales = get_azure_locales_for_language(language_code)
    all_logs.append(f" Matching Azure locales: {azure_locales}")

    # Check Azure ASR support
    azure_asr_locales = [loc for loc in azure_locales if loc in azure_asr]
    azure_asr_available = len(azure_asr_locales) > 0
    all_logs.append(f" Azure ASR: {'✅ Supported' if azure_asr_available else '❌ Not supported'} ({len(azure_asr_locales)} locales)")

    # Check Azure TTS support and count voices
    azure_tts_locales = [loc for loc in azure_locales if loc in azure_tts]
    azure_tts_available = len(azure_tts_locales) > 0
    azure_total_voices = sum(azure_tts[loc]['voice_count'] for loc in azure_tts_locales)
    all_logs.append(f" Azure TTS: {'✅ Supported' if azure_tts_available else '❌ Not supported'} ({len(azure_tts_locales)} locales, {azure_total_voices} voices)")

    # Fetch Google Cloud data
    all_logs.append("\n[Google Cloud Speech]")
    google_stt = fetch_google_stt_languages()
    google_tts = fetch_google_tts_languages()
    all_logs.append(f" Fetched {len(google_stt)} STT languages and {len(google_tts)} TTS languages from Google Cloud")

    # Get matching Google Cloud locales using alpha2 code
    google_locales = get_google_locales_for_language(language_code)
    all_logs.append(f" Matching Google Cloud locales: {google_locales}")

    # Check Google Cloud STT support
    google_stt_locales = [loc for loc in google_locales if loc in google_stt]
    google_stt_available = len(google_stt_locales) > 0
    all_logs.append(f" Google STT: {'✅ Supported' if google_stt_available else '❌ Not supported'} ({len(google_stt_locales)} locales)")

    # Check Google Cloud TTS support and count voices
    google_tts_locales = [loc for loc in google_locales if loc in google_tts]
    google_tts_available = len(google_tts_locales) > 0
    google_total_voices = sum(google_tts[loc]['voice_count'] for loc in google_tts_locales)
    all_logs.append(f" Google TTS: {'✅ Supported' if google_tts_available else '❌ Not supported'} ({len(google_tts_locales)} locales, {google_total_voices} voices)")

    # Fetch AWS data
    all_logs.append("\n[AWS (Transcribe + Polly)]")
    aws_transcribe = fetch_aws_transcribe_languages()
    aws_polly = fetch_aws_polly_languages()
    all_logs.append(f" Fetched {len(aws_transcribe)} Transcribe languages and {len(aws_polly)} Polly languages from AWS")

    # Get matching AWS locales using alpha2 code
    aws_locales = get_aws_locales_for_language(language_code)
    all_logs.append(f" Matching AWS locales: {aws_locales}")

    # Check AWS Transcribe support
    aws_transcribe_locales = [loc for loc in aws_locales if loc in aws_transcribe]
    aws_transcribe_available = len(aws_transcribe_locales) > 0
    all_logs.append(f" AWS Transcribe: {'✅ Supported' if aws_transcribe_available else '❌ Not supported'} ({len(aws_transcribe_locales)} locales)")

    # Check AWS Polly support and count voices
    aws_polly_locales = [loc for loc in aws_locales if loc in aws_polly]
    aws_polly_available = len(aws_polly_locales) > 0
    aws_total_voices = sum(aws_polly[loc]['voice_count'] for loc in aws_polly_locales)
    all_logs.append(f" AWS Polly: {'✅ Supported' if aws_polly_available else '❌ Not supported'} ({len(aws_polly_locales)} locales, {aws_total_voices} voices)")

    # Commercial Services
    commercial_rows = []

    # Azure Speech
    if azure_asr_available:
        azure_asr_text = f"✅ {len(azure_asr_locales)} locale(s)"
    else:
        azure_asr_text = "❌ N/A"
    if azure_tts_available:
        azure_tts_text = f"✅ {len(azure_tts_locales)} locale(s), {azure_total_voices} voice(s)"
    else:
        azure_tts_text = "❌ N/A"
    commercial_rows.append({
        "Service": "Azure Speech",
        "ASR": azure_asr_text,
        "TTS": azure_tts_text,
    })

    # Google Cloud Speech
    if google_stt_available:
        google_stt_text = f"✅ {len(google_stt_locales)} locale(s)"
    else:
        google_stt_text = "❌ N/A"
    if google_tts_available:
        google_tts_text = f"✅ {len(google_tts_locales)} locale(s), {google_total_voices} voice(s)"
    else:
        google_tts_text = "❌ N/A"
    commercial_rows.append({
        "Service": "Google Cloud Speech",
        "ASR": google_stt_text,
        "TTS": google_tts_text,
    })

    # AWS (Transcribe + Polly)
    if aws_transcribe_available:
        aws_transcribe_text = f"✅ {len(aws_transcribe_locales)} locale(s)"
    else:
        aws_transcribe_text = "❌ N/A"
    if aws_polly_available:
        aws_polly_text = f"✅ {len(aws_polly_locales)} locale(s), {aws_total_voices} voice(s)"
    else:
        aws_polly_text = "❌ N/A"
    commercial_rows.append({
        "Service": "AWS (Transcribe + Polly)",
        "ASR": aws_transcribe_text,
        "TTS": aws_polly_text,
    })

    # ElevenLabs Multilingual v2 (TTS only)
    all_logs.append("\n[ElevenLabs]")
    elevenlabs_v2_supported = check_elevenlabs_multilingual_v2_support(language_code)
    all_logs.append(f" Multilingual v2: {'✅ Supported' if elevenlabs_v2_supported else '❌ Not supported'}")
    if elevenlabs_v2_supported:
        elevenlabs_v2_tts_text = "✅ Supported"
    else:
        elevenlabs_v2_tts_text = "❌ N/A"
    commercial_rows.append({
        "Service": "ElevenLabs Multilingual v2",
        "ASR": "N/A",  # ElevenLabs doesn't offer ASR
        "TTS": elevenlabs_v2_tts_text,
    })

    # ElevenLabs Turbo v3 (TTS only)
    elevenlabs_v3_supported = check_elevenlabs_turbo_v3_support(language_code)
    all_logs.append(f" Turbo v3: {'✅ Supported' if elevenlabs_v3_supported else '❌ Not supported'}")
    if elevenlabs_v3_supported:
        elevenlabs_v3_tts_text = "✅ Supported"
    else:
        elevenlabs_v3_tts_text = "❌ N/A"
    commercial_rows.append({
        "Service": "ElevenLabs Turbo v3",
        "ASR": "N/A",  # ElevenLabs doesn't offer ASR
        "TTS": elevenlabs_v3_tts_text,
    })

    commercial_df = pd.DataFrame(commercial_rows)

    # HuggingFace Models - Search for real ASR and TTS models
    all_logs.append("\n[HuggingFace Models]")
    asr_models, asr_model_logs = search_huggingface_models(language_code, 'automatic-speech-recognition', max_results=100, max_pages=5)
    all_logs.extend([f" [ASR] {log}" for log in asr_model_logs])
    tts_models, tts_model_logs = search_huggingface_models(language_code, 'text-to-speech', max_results=100, max_pages=5)
    all_logs.extend([f" [TTS] {log}" for log in tts_model_logs])

    # Apply deduplication if requested
    if deduplicate:
        all_logs.append(f"\n[Deduplication]")
        asr_before = len(asr_models)
        asr_models = deduplicate_models(asr_models)
        all_logs.append(f" ASR models: {asr_before} → {len(asr_models)} (removed {asr_before - len(asr_models)} duplicates)")
        tts_before = len(tts_models)
        tts_models = deduplicate_models(tts_models)
        all_logs.append(f" TTS models: {tts_before} → {len(tts_models)} (removed {tts_before - len(tts_models)} duplicates)")
    else:
        # Add duplicates count of 1 for all models when not deduplicating
        for model in asr_models:
            model['duplicates'] = 1
        for model in tts_models:
            model['duplicates'] = 1

    # Format ASR models with clickable names
    asr_models_data = []
    for model in asr_models:
        asr_models_data.append({
            "Model Name": f"[{model['name']}]({model['url']})",
            "Downloads": model['downloads'],
            "Likes": model['likes'],
            "Size": model.get('size', ''),
            "Duplicates": model.get('duplicates', 1),
        })
    if asr_models_data:
        asr_models_df = pd.DataFrame(asr_models_data)
    else:
        # Empty dataframe if no models found
        asr_models_df = pd.DataFrame(columns=["Model Name", "Downloads", "Likes", "Size", "Duplicates"])

    # Format TTS models with clickable names
    tts_models_data = []
    for model in tts_models:
        tts_models_data.append({
            "Model Name": f"[{model['name']}]({model['url']})",
            "Downloads": model['downloads'],
            "Likes": model['likes'],
            "Size": model.get('size', ''),
            "Duplicates": model.get('duplicates', 1),
        })
    if tts_models_data:
        tts_models_df = pd.DataFrame(tts_models_data)
    else:
        # Empty dataframe if no models found
        tts_models_df = pd.DataFrame(columns=["Model Name", "Downloads", "Likes", "Size", "Duplicates"])

    # HuggingFace Datasets - Search for real ASR and TTS datasets
    all_logs.append("\n[HuggingFace Datasets]")
    asr_datasets, asr_dataset_logs = search_huggingface_datasets(language_code, 'automatic-speech-recognition', max_results=100, max_pages=5)
    all_logs.extend([f" [ASR] {log}" for log in asr_dataset_logs])
    tts_datasets, tts_dataset_logs = search_huggingface_datasets(language_code, 'text-to-speech', max_results=100, max_pages=5)
    all_logs.extend([f" [TTS] {log}" for log in tts_dataset_logs])

    # Format ASR datasets with clickable names
    asr_datasets_data = []
    for dataset in asr_datasets:
        asr_datasets_data.append({
            "Dataset Name": f"[{dataset['name']}]({dataset['url']})",
            "Downloads": dataset['downloads'],
            "Likes": dataset['likes'],
            "Size": dataset.get('size', ''),
        })
    if asr_datasets_data:
        asr_datasets_df = pd.DataFrame(asr_datasets_data)
    else:
        # Empty dataframe if no datasets found
        asr_datasets_df = pd.DataFrame(columns=["Dataset Name", "Downloads", "Likes", "Size"])

    # Format TTS datasets with clickable names
    tts_datasets_data = []
    for dataset in tts_datasets:
        tts_datasets_data.append({
            "Dataset Name": f"[{dataset['name']}]({dataset['url']})",
            "Downloads": dataset['downloads'],
            "Likes": dataset['likes'],
            "Size": dataset.get('size', ''),
        })
    if tts_datasets_data:
        tts_datasets_df = pd.DataFrame(tts_datasets_data)
    else:
        # Empty dataframe if no datasets found
        tts_datasets_df = pd.DataFrame(columns=["Dataset Name", "Downloads", "Likes", "Size"])

    # Combine all logs
    log_text = "\n".join(all_logs)

    # Return separate ASR and TTS dataframes, plus counts for tab labels, plus logs
    return (
        commercial_df, asr_models_df, tts_models_df,
        len(asr_models), len(tts_models),
        asr_datasets_df, tts_datasets_df,
        len(asr_datasets), len(tts_datasets),
        log_text,
    )


# Initialize - load language list and app content
print("Initializing Speech Resource Finder...")
load_app_content()
load_language_list()
load_language_taxonomy()

# Create language choices for dropdown (code: name format for easy searching)
language_choices = [
    f"{code}: {info['name']}"
    for code, info in sorted(LANGUAGES.items(), key=lambda x: x[1]['name'])
]
print(f"Created dropdown with {len(language_choices)} language options")

with gr.Blocks(title=APP_CONTENT["title"]) as demo:
    gr.Markdown(f"# {APP_CONTENT['title']}")
    gr.Markdown(APP_CONTENT["description"])

    with gr.Row(equal_height=True):
        with gr.Column(scale=2):
            language_dropdown = gr.Dropdown(
                choices=language_choices,
                label="Select Language",
                info="Type to search for a language",
                allow_custom_value=False,
                filterable=True,
            )
        with gr.Column(scale=1):
            # Minimal placeholder markup shown before a language is selected
            language_metadata = gr.HTML(
                """
                <div style="text-align: center; padding: 8px; color: #757575;">
                    Select a language to see resource classification
                </div>
                """,
                elem_id="language-metadata",
            )

    gr.Markdown("## Commercial Services")
    commercial_table = gr.Dataframe(
        headers=["Service", "ASR", "TTS"],
        interactive=False,
        wrap=True,
    )

    gr.Markdown("## HuggingFace Models")
    with gr.Row():
        deduplicate_checkbox = gr.Checkbox(
            label="Deduplicate models",
            value=True,
            info="Keep only the model with most downloads for each base name",
        )

    # Create tabs for ASR and TTS models with count labels
    with gr.Tabs():
        with gr.Tab(label="ASR Models") as asr_tab:
            asr_count_label = gr.Markdown("*Loading...*")
            asr_models_table = gr.Dataframe(
                headers=["Model Name", "Downloads", "Likes", "Size", "Duplicates"],
                interactive=False,
                wrap=True,
                datatype=["markdown", "number", "number", "str", "number"],
            )
        with gr.Tab(label="TTS Models") as tts_tab:
            tts_count_label = gr.Markdown("*Loading...*")
            tts_models_table = gr.Dataframe(
                headers=["Model Name", "Downloads", "Likes", "Size", "Duplicates"],
                interactive=False,
                wrap=True,
                datatype=["markdown", "number", "number", "str", "number"],
            )

    gr.Markdown("## HuggingFace Datasets")
    # Create tabs for ASR and TTS datasets with count labels
    with gr.Tabs():
        with gr.Tab(label="ASR Datasets") as asr_datasets_tab:
            asr_datasets_count_label = gr.Markdown("*Loading...*")
            asr_datasets_table = gr.Dataframe(
                headers=["Dataset Name", "Downloads", "Likes", "Size"],
                interactive=False,
                wrap=True,
                datatype=["markdown", "number", "number", "str"],
            )
        with gr.Tab(label="TTS Datasets") as tts_datasets_tab:
            tts_datasets_count_label = gr.Markdown("*Loading...*")
            tts_datasets_table = gr.Dataframe(
                headers=["Dataset Name", "Downloads", "Likes", "Size"],
                interactive=False,
                wrap=True,
                datatype=["markdown", "number", "number", "str"],
            )

    with gr.Accordion("Logs", open=False):
        log_textbox = gr.Textbox(
            show_label=False,
            lines=15,
            max_lines=30,
            interactive=False,
            placeholder="Logs will appear here...",
            autoscroll=True,
        )

    # About section with full content
    with gr.Accordion("About this tool", open=False):
        gr.Markdown(APP_CONTENT["full_content"])

    def on_search(language_selection, deduplicate):
        if not language_selection:
            # Minimal placeholder markup shown when no language is selected
            default_html = """
            <div style="text-align: center; padding: 8px; color: #757575;">
                Select a language to see resource classification
            </div>
            """
            return default_html, None, "", None, "", None, "", None, "", None, ""

        # Extract the language code from "code: name" format
        language_code = language_selection.split(":")[0].strip()

        # Get language name for taxonomy lookup
        language_name = LANGUAGES.get(language_code, {}).get("name", "")

        # Get taxonomy classification
        level, classification = get_language_taxonomy_info(language_name)

        # Create metadata display with color coding
        # (minimal markup: the classification label is tinted with the taxonomy colour)
        if level is not None:
            color = get_taxonomy_color(level)
            metadata_html = f"""
            <div style="text-align: center; padding: 8px;">
                <div style="font-size: 1.1em; font-weight: bold;">{language_name}</div>
                <div style="color: {color}; font-weight: bold;">{classification}</div>
                <div style="font-size: 0.85em; color: #757575;">Source: Joshi et al.</div>
            </div>
            """
        else:
            metadata_html = f"""
            <div style="text-align: center; padding: 8px;">
                <div style="font-size: 1.1em; font-weight: bold;">{language_name}</div>
                <div style="color: #757575; font-weight: bold;">Unknown</div>
                <div style="font-size: 0.85em; color: #757575;">Source: Joshi et al.</div>
            </div>
            """

        (
            commercial_df, asr_models_df, tts_models_df,
            asr_models_count, tts_models_count,
            asr_datasets_df, tts_datasets_df,
            asr_datasets_count, tts_datasets_count,
            logs,
        ) = search_language_resources(language_code, deduplicate=deduplicate)

        # Create count labels
        asr_models_label = f"**Found {asr_models_count} ASR model(s)**"
        tts_models_label = f"**Found {tts_models_count} TTS model(s)**"
        asr_datasets_label = f"**Found {asr_datasets_count} ASR dataset(s)**"
        tts_datasets_label = f"**Found {tts_datasets_count} TTS dataset(s)**"

        return (
            metadata_html, commercial_df,
            asr_models_label, asr_models_df,
            tts_models_label, tts_models_df,
            asr_datasets_label, asr_datasets_df,
            tts_datasets_label, tts_datasets_df,
            logs,
        )

    # Trigger search when language is selected
    language_dropdown.change(
        fn=on_search,
        inputs=[language_dropdown, deduplicate_checkbox],
        outputs=[
            language_metadata, commercial_table,
            asr_count_label, asr_models_table,
            tts_count_label, tts_models_table,
            asr_datasets_count_label, asr_datasets_table,
            tts_datasets_count_label, tts_datasets_table,
            log_textbox,
        ],
    )

    # Trigger search when deduplicate checkbox is changed
    deduplicate_checkbox.change(
        fn=on_search,
        inputs=[language_dropdown, deduplicate_checkbox],
        outputs=[
            language_metadata, commercial_table,
            asr_count_label, asr_models_table,
            tts_count_label, tts_models_table,
            asr_datasets_count_label, asr_datasets_table,
            tts_datasets_count_label, tts_datasets_table,
            log_textbox,
        ],
    )


if __name__ == "__main__":
    demo.launch(server_name="0.0.0.0", server_port=7860, share=False, show_error=True)