import gradio as gr
import pandas as pd
import requests
from bs4 import BeautifulSoup
from functools import lru_cache
import csv
from io import StringIO
import re
# Configuration
LANGUAGE_CODES_FILE = "language-codes-full.csv"
APP_CONTENT_FILE = "app_content.md"
LANGUAGE_TAXONOMY_URL = "https://microsoft.github.io/linguisticdiversity/assets/lang2tax.txt"
# Language list will be loaded from CSV
# Structure: {alpha3_b: {"name": str, "alpha3_t": str, "alpha2": str}}
LANGUAGES = {}
# Language taxonomy mapping (from Joshi et al.'s linguistic diversity paper)
# Structure: {language_name_lowercase: level}
LANGUAGE_TAXONOMY = {}
# Taxonomy level descriptions
TAXONOMY_LEVELS = {
0: "The Left-Behinds",
1: "The Scraping-Bys",
2: "The Hopefuls",
3: "The Rising Stars",
4: "The Underdogs",
5: "The Winners"
}
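# Illustrative note (assumed from the parser in load_language_taxonomy below, which splits each
# line on a comma into name,level): each line of the remote lang2tax.txt file is expected to look
# like "<language name>,<level>", e.g. "english,5", where the level is an integer from 0 to 5
# mapped by TAXONOMY_LEVELS above.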
# App content will be loaded from markdown file
APP_CONTENT = {
"title": "Speech Resource Finder",
"description": "Search for speech resources",
"full_content": ""
}
def load_app_content(content_path=None):
"""Load app content from markdown file"""
global APP_CONTENT
if content_path is None:
content_path = APP_CONTENT_FILE
try:
with open(content_path, 'r', encoding='utf-8') as f:
content = f.read()
# Parse markdown content
lines = content.split('\n')
# Extract title (first # heading)
title = "Speech Resource Finder"
for line in lines:
if line.startswith('# '):
title = line[2:].strip()
break
# Extract description (text after ## Description until next ##)
description = ""
in_description = False
for line in lines:
if line.startswith('## Description'):
in_description = True
continue
elif in_description and line.startswith('##'):
break
elif in_description and line.strip():
description += line.strip() + " "
APP_CONTENT = {
"title": title,
"description": description.strip(),
"full_content": content
}
print(f"Loaded app content from {content_path}")
except Exception as e:
print(f"Error loading app content: {e}")
print("Using default content")
def load_language_list(csv_path=None):
"""Load ISO 639 language codes from CSV file"""
global LANGUAGES
if csv_path is None:
csv_path = LANGUAGE_CODES_FILE
try:
with open(csv_path, 'r', encoding='utf-8') as f:
reader = csv.DictReader(f)
for row in reader:
# Use alpha3-b as primary key, fallback to alpha3-t if empty
code_b = row['alpha3-b'].strip()
code_t = row['alpha3-t'].strip()
code_2 = row['alpha2'].strip()
name = row['English'].strip()
primary_code = code_b if code_b else code_t
if primary_code and name:
LANGUAGES[primary_code] = {
"name": name,
"alpha3_b": code_b,
"alpha3_t": code_t,
"alpha2": code_2
}
print(f"Loaded {len(LANGUAGES)} languages from {csv_path}")
except Exception as e:
print(f"Error loading language list: {e}")
# Fallback to a minimal set
LANGUAGES = {
"eng": {"name": "English", "alpha3_b": "eng", "alpha3_t": "", "alpha2": "en"},
"spa": {"name": "Spanish", "alpha3_b": "spa", "alpha3_t": "", "alpha2": "es"},
"fra": {"name": "French", "alpha3_b": "fra", "alpha3_t": "", "alpha2": "fr"},
"deu": {"name": "German", "alpha3_b": "ger", "alpha3_t": "deu", "alpha2": "de"},
}
print(f"Using fallback with {len(LANGUAGES)} languages")
def load_language_taxonomy():
"""Load language taxonomy data from Microsoft's linguistic diversity project"""
global LANGUAGE_TAXONOMY
try:
response = requests.get(LANGUAGE_TAXONOMY_URL, timeout=10)
response.raise_for_status()
# Parse the CSV-like content (format: language_name,level)
for line in response.text.strip().split('\n'):
if line.strip():
parts = line.strip().split(',')
# Skip malformed lines so a single bad row does not abort the whole load
if len(parts) == 2 and parts[1].strip().isdigit():
lang_name = parts[0].strip().lower()
level = int(parts[1].strip())
LANGUAGE_TAXONOMY[lang_name] = level
print(f"Loaded taxonomy data for {len(LANGUAGE_TAXONOMY)} languages")
except Exception as e:
print(f"Warning: Could not load language taxonomy: {e}")
print("Language classification will show as 'Unknown'")
def get_taxonomy_color(level):
"""
Get color code for taxonomy level (red for left-behind, green for winners)
"""
colors = {
0: "#d32f2f", # Red - The Left-Behinds
1: "#f57c00", # Orange - The Scraping-Bys
2: "#fbc02d", # Yellow - The Hopefuls
3: "#afb42b", # Yellow-green - The Rising Stars
4: "#7cb342", # Light green - The Underdogs
5: "#388e3c", # Green - The Winners
}
return colors.get(level, "#757575") # Gray for unknown
def get_language_taxonomy_info(language_name):
"""
Get taxonomy classification for a language.
Returns a tuple of (level, description) or (None, "Unknown")
"""
if not language_name:
return None, "Unknown"
# Try exact match (case-insensitive)
lang_lower = language_name.lower()
if lang_lower in LANGUAGE_TAXONOMY:
level = LANGUAGE_TAXONOMY[lang_lower]
return level, TAXONOMY_LEVELS.get(level, f"Level {level}")
# Try with semicolon-separated alternative names (e.g., "Catalan; Valencian")
if ';' in lang_lower:
parts = [p.strip() for p in lang_lower.split(';')]
for part in parts:
if part in LANGUAGE_TAXONOMY:
level = LANGUAGE_TAXONOMY[part]
return level, TAXONOMY_LEVELS.get(level, f"Level {level}")
# Try with comma-separated variations (e.g., "Chinese, Mandarin")
if ',' in lang_lower:
parts = [p.strip() for p in lang_lower.split(',')]
for part in parts:
if part in LANGUAGE_TAXONOMY:
level = LANGUAGE_TAXONOMY[part]
return level, TAXONOMY_LEVELS.get(level, f"Level {level}")
return None, "Unknown"
@lru_cache(maxsize=1)
def fetch_azure_asr_languages():
"""Scrape Azure Speech-to-Text supported languages"""
url = "https://learn.microsoft.com/en-us/azure/ai-services/speech-service/language-support?tabs=stt"
try:
response = requests.get(url, timeout=10)
response.raise_for_status()
soup = BeautifulSoup(response.content, 'html.parser')
# Find the table with locale data
# The table has columns: Locale (BCP-47) | Language | Fast transcription support | Custom speech support
tables = soup.find_all('table')
azure_asr = {}
for table in tables:
rows = table.find_all('tr')
if not rows:
continue
# Check if this is the right table by looking at headers
headers = [th.get_text(strip=True) for th in rows[0].find_all('th')]
if 'Locale' in ' '.join(headers) or 'Language' in ' '.join(headers):
for row in rows[1:]: # Skip header
cols = row.find_all('td')
if len(cols) >= 2:
locale = cols[0].get_text(strip=True)
language = cols[1].get_text(strip=True)
if locale and language:
azure_asr[locale] = language
break
return azure_asr
except Exception as e:
print(f"Error fetching Azure ASR data: {e}")
return {}
@lru_cache(maxsize=1)
def fetch_azure_tts_languages():
"""Scrape Azure Text-to-Speech supported languages with voice counts"""
url = "https://learn.microsoft.com/en-us/azure/ai-services/speech-service/language-support?tabs=tts"
try:
response = requests.get(url, timeout=10)
response.raise_for_status()
soup = BeautifulSoup(response.content, 'html.parser')
# Find the TTS table
# Columns: Locale (BCP-47) | Language | Text to speech voices
tables = soup.find_all('table')
azure_tts = {}
for table in tables:
rows = table.find_all('tr')
if not rows:
continue
headers = [th.get_text(strip=True) for th in rows[0].find_all('th')]
if 'Text to speech' in ' '.join(headers) or 'voices' in ' '.join(headers).lower():
for row in rows[1:]:
cols = row.find_all('td')
if len(cols) >= 3:
locale = cols[0].get_text(strip=True)
language = cols[1].get_text(strip=True)
voices_text = cols[2].get_text(strip=True)
# Count number of voices (look for "Neural" in the text)
voice_count = voices_text.count('Neural')
if locale and language:
azure_tts[locale] = {
'language': language,
'voice_count': voice_count
}
break
return azure_tts
except Exception as e:
print(f"Error fetching Azure TTS data: {e}")
return {}
@lru_cache(maxsize=1)
def fetch_google_stt_languages():
"""Scrape Google Cloud Speech-to-Text supported languages"""
url = "https://cloud.google.com/speech-to-text/docs/speech-to-text-supported-languages"
try:
response = requests.get(url, timeout=10)
response.raise_for_status()
soup = BeautifulSoup(response.content, 'html.parser')
# Find tables with BCP-47 language codes
tables = soup.find_all('table')
google_stt = {}
for table in tables:
rows = table.find_all('tr')
if not rows:
continue
# Check if this table has BCP-47 column
headers = [th.get_text(strip=True) for th in rows[0].find_all('th')]
# Find BCP-47 column index
bcp47_idx = None
name_idx = None
for idx, header in enumerate(headers):
if 'BCP-47' in header or 'BCP47' in header:
bcp47_idx = idx
if 'Name' in header and name_idx is None:
name_idx = idx
if bcp47_idx is not None:
for row in rows[1:]: # Skip header
cols = row.find_all('td')
if len(cols) > bcp47_idx:
locale = cols[bcp47_idx].get_text(strip=True)
language = cols[name_idx].get_text(strip=True) if name_idx is not None and len(cols) > name_idx else ''
if locale and locale not in ['—', '-', '']:
google_stt[locale] = language
return google_stt
except Exception as e:
print(f"Error fetching Google STT data: {e}")
return {}
@lru_cache(maxsize=1)
def fetch_google_tts_languages():
"""Scrape Google Cloud Text-to-Speech supported languages with voice counts"""
url = "https://cloud.google.com/text-to-speech/docs/voices"
try:
response = requests.get(url, timeout=10)
response.raise_for_status()
soup = BeautifulSoup(response.content, 'html.parser')
# Find the voices table
# Columns: Language | Voice type | Language code | Voice name | SSML Gender | Sample
tables = soup.find_all('table')
google_tts = {}
for table in tables:
rows = table.find_all('tr')
if not rows:
continue
headers = [th.get_text(strip=True) for th in rows[0].find_all('th')]
# Find Language code column index
lang_code_idx = None
for idx, header in enumerate(headers):
if 'Language code' in header or 'language code' in header.lower():
lang_code_idx = idx
break
if lang_code_idx is not None:
for row in rows[1:]:
cols = row.find_all('td')
if len(cols) > lang_code_idx:
locale = cols[lang_code_idx].get_text(strip=True)
if locale and locale not in ['—', '-', '']:
# Count voices per locale
if locale in google_tts:
google_tts[locale]['voice_count'] += 1
else:
language = cols[0].get_text(strip=True) if len(cols) > 0 else ''
google_tts[locale] = {
'language': language,
'voice_count': 1
}
return google_tts
except Exception as e:
print(f"Error fetching Google TTS data: {e}")
return {}
@lru_cache(maxsize=1)
def fetch_elevenlabs_multilingual_v2():
"""Get ElevenLabs Multilingual v2 supported languages"""
# Based on https://elevenlabs.io/docs/models#multilingual-v2
# These are mostly ISO 639-1 (2-letter) codes; 'fil' (Filipino) is a 3-letter exception with no alpha2 equivalent
supported_codes = {
'en', 'ja', 'zh', 'de', 'hi', 'fr', 'ko', 'pt', 'it', 'es',
'id', 'nl', 'tr', 'fil', 'pl', 'sv', 'bg', 'ro', 'ar', 'cs',
'el', 'fi', 'hr', 'ms', 'sk', 'da', 'ta', 'uk', 'ru'
}
return supported_codes
@lru_cache(maxsize=1)
def fetch_elevenlabs_turbo_v3():
"""Get ElevenLabs Eleven Turbo v3 (formerly v3 Alpha) supported languages"""
# Based on https://elevenlabs.io/docs/models#eleven-v3-alpha
# These are ISO 639-3 (3-letter) codes
supported_codes = {
'afr', 'ara', 'hye', 'asm', 'aze', 'bel', 'ben', 'bos', 'bul', 'cat',
'ceb', 'nya', 'hrv', 'ces', 'dan', 'nld', 'eng', 'est', 'fil', 'fin',
'fra', 'glg', 'kat', 'deu', 'ell', 'guj', 'hau', 'heb', 'hin', 'hun',
'isl', 'ind', 'gle', 'ita', 'jpn', 'jav', 'kan', 'kaz', 'kir', 'kor',
'lav', 'lin', 'lit', 'ltz', 'mkd', 'msa', 'mal', 'cmn', 'mar', 'nep',
'nor', 'pus', 'fas', 'pol', 'por', 'pan', 'ron', 'rus', 'srp', 'snd',
'slk', 'slv', 'som', 'spa', 'swa', 'swe', 'tam', 'tel', 'tha', 'tur',
'ukr', 'urd', 'vie', 'cym'
}
return supported_codes
@lru_cache(maxsize=1)
def fetch_aws_transcribe_languages():
"""Scrape AWS Transcribe (ASR) supported languages"""
url = "https://docs.aws.amazon.com/transcribe/latest/dg/supported-languages.html"
try:
response = requests.get(url, timeout=10)
response.raise_for_status()
soup = BeautifulSoup(response.content, 'html.parser')
# Find tables with language codes
tables = soup.find_all('table')
aws_transcribe = {}
for table in tables:
rows = table.find_all('tr')
if not rows:
continue
# Check if this table has language code column
headers = [th.get_text(strip=True) for th in rows[0].find_all('th')]
# Find language code column index
lang_code_idx = None
lang_name_idx = None
for idx, header in enumerate(headers):
if 'language code' in header.lower():
lang_code_idx = idx
elif header.startswith('Language') and lang_name_idx is None:
lang_name_idx = idx
if lang_code_idx is not None:
for row in rows[1:]: # Skip header
cols = row.find_all('td')
if len(cols) > lang_code_idx:
locale = cols[lang_code_idx].get_text(strip=True)
language = cols[lang_name_idx].get_text(strip=True) if lang_name_idx is not None and len(cols) > lang_name_idx else ''
if locale and locale not in ['—', '-', '']:
aws_transcribe[locale] = language
return aws_transcribe
except Exception as e:
print(f"Error fetching AWS Transcribe data: {e}")
return {}
@lru_cache(maxsize=1)
def fetch_aws_polly_languages():
"""Scrape AWS Polly (TTS) supported languages"""
url = "https://docs.aws.amazon.com/polly/latest/dg/supported-languages.html"
try:
response = requests.get(url, timeout=10)
response.raise_for_status()
soup = BeautifulSoup(response.content, 'html.parser')
# Find tables with language codes
tables = soup.find_all('table')
aws_polly = {}
for table in tables:
rows = table.find_all('tr')
if not rows:
continue
# Check if this table has language code column
headers = [th.get_text(strip=True) for th in rows[0].find_all('th')]
# Find language code column index
lang_code_idx = None
lang_name_idx = None
for idx, header in enumerate(headers):
if 'language code' in header.lower():
lang_code_idx = idx
elif header.startswith('Language') and lang_name_idx is None:
lang_name_idx = idx
if lang_code_idx is not None:
for row in rows[1:]: # Skip header
cols = row.find_all('td')
if len(cols) > lang_code_idx:
locale = cols[lang_code_idx].get_text(strip=True)
language = cols[lang_name_idx].get_text(strip=True) if lang_name_idx is not None and len(cols) > lang_name_idx else ''
if locale and locale not in ['—', '-', '']:
# Count voices per locale (each row is a different voice/locale combo)
if locale in aws_polly:
aws_polly[locale]['voice_count'] += 1
else:
aws_polly[locale] = {
'language': language,
'voice_count': 1
}
return aws_polly
except Exception as e:
print(f"Error fetching AWS Polly data: {e}")
return {}
def get_azure_locales_for_language(language_code):
"""
Get Azure BCP-47 locales for a language using its alpha2 code
Returns list of matching locales from Azure
"""
lang_info = LANGUAGES.get(language_code)
if not lang_info or not lang_info['alpha2']:
return []
alpha2 = lang_info['alpha2']
azure_asr = fetch_azure_asr_languages()
azure_tts = fetch_azure_tts_languages()
# Find all locales that start with the alpha2 code
matching_locales = set()
for locale in azure_asr.keys():
if locale.startswith(alpha2 + '-') or locale == alpha2:
matching_locales.add(locale)
for locale in azure_tts.keys():
if locale.startswith(alpha2 + '-') or locale == alpha2:
matching_locales.add(locale)
return sorted(matching_locales)
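# Illustrative example of the prefix matching above (locales shown are typical BCP-47 values and
# may differ from what the Azure pages actually list at any given time):
#   alpha2 "en" matches "en-US", "en-GB", "en-IN", ... and also a bare "en" entry if present.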
def get_google_locales_for_language(language_code):
"""
Get Google Cloud BCP-47 locales for a language using its alpha2 code
Returns list of matching locales from Google Cloud
"""
lang_info = LANGUAGES.get(language_code)
if not lang_info or not lang_info['alpha2']:
return []
alpha2 = lang_info['alpha2']
google_stt = fetch_google_stt_languages()
google_tts = fetch_google_tts_languages()
# Find all locales that start with the alpha2 code
matching_locales = set()
for locale in google_stt.keys():
if locale.startswith(alpha2 + '-') or locale == alpha2:
matching_locales.add(locale)
for locale in google_tts.keys():
if locale.startswith(alpha2 + '-') or locale == alpha2:
matching_locales.add(locale)
return sorted(matching_locales)
def check_elevenlabs_multilingual_v2_support(language_code):
"""
Check if ElevenLabs Multilingual v2 supports a language using ISO 639-1 (alpha2) codes
Returns True if supported, False otherwise
"""
lang_info = LANGUAGES.get(language_code)
if not lang_info:
return False
supported_codes = fetch_elevenlabs_multilingual_v2()
# Check alpha2 code (2-letter code)
if lang_info['alpha2'] and lang_info['alpha2'] in supported_codes:
return True
# Some entries in the supported set (e.g. 'fil') are 3-letter codes with no alpha2 equivalent,
# so fall back to the alpha3 codes as well
if lang_info['alpha3_b'] and lang_info['alpha3_b'] in supported_codes:
return True
if lang_info['alpha3_t'] and lang_info['alpha3_t'] in supported_codes:
return True
return False
def check_elevenlabs_turbo_v3_support(language_code):
"""
Check if ElevenLabs Turbo v3 supports a language using ISO 639-3 (alpha3) codes
Returns True if supported, False otherwise
"""
lang_info = LANGUAGES.get(language_code)
if not lang_info:
return False
supported_codes = fetch_elevenlabs_turbo_v3()
# Check alpha3_b code first (3-letter code, bibliographic)
if lang_info['alpha3_b'] and lang_info['alpha3_b'] in supported_codes:
return True
# Check alpha3_t code (3-letter code, terminological)
if lang_info['alpha3_t'] and lang_info['alpha3_t'] in supported_codes:
return True
return False
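# Illustrative example of the two-step check above: the German entry in LANGUAGES has
# alpha3_b "ger" (not in the v3 set) and alpha3_t "deu" (in the set), so the terminological
# fallback is what makes check_elevenlabs_turbo_v3_support return True for German.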
def get_aws_locales_for_language(language_code):
"""
Get AWS locales for a language using its alpha2 code
Returns list of matching locales from AWS Transcribe and Polly
"""
lang_info = LANGUAGES.get(language_code)
if not lang_info or not lang_info['alpha2']:
return []
alpha2 = lang_info['alpha2']
aws_transcribe = fetch_aws_transcribe_languages()
aws_polly = fetch_aws_polly_languages()
# Find all locales that start with the alpha2 code
matching_locales = set()
for locale in aws_transcribe.keys():
if locale.startswith(alpha2 + '-') or locale == alpha2:
matching_locales.add(locale)
for locale in aws_polly.keys():
if locale.startswith(alpha2 + '-') or locale == alpha2:
matching_locales.add(locale)
return sorted(matching_locales)
def search_huggingface_models(language_code, pipeline_tag, max_results=100, max_pages=3):
"""
Search HuggingFace for models supporting a specific language
pipeline_tag: 'automatic-speech-recognition' or 'text-to-speech'
max_results: maximum number of models to return
max_pages: maximum number of pages to search per language code
Returns tuple: (list of model dictionaries, log messages)
"""
lang_info = LANGUAGES.get(language_code)
logs = []
if not lang_info:
logs.append(f"No language info found for code: {language_code}")
return [], logs
# Try multiple language code formats
codes_to_try = []
if lang_info['alpha2']:
codes_to_try.append(lang_info['alpha2']) # 2-letter code
if lang_info['alpha3_b']:
codes_to_try.append(lang_info['alpha3_b']) # 3-letter code
if lang_info['alpha3_t']:
codes_to_try.append(lang_info['alpha3_t']) # 3-letter terminological
logs.append(f"Language codes to search: {set(codes_to_try)}")
models = []
seen_models = set()
for code in codes_to_try:
if len(models) >= max_results:
break
logs.append(f"Searching for language code: {code}")
# Try multiple pages for this language code
for page in range(max_pages):
if len(models) >= max_results:
break
try:
# Use HuggingFace model search with pagination
url = f"https://huggingface.co/models?pipeline_tag={pipeline_tag}&language={code}&sort=trending"
if page > 0:
url += f"&p={page}"
logs.append(f" Page {page}: {url}")
headers = {
'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36'
}
response = requests.get(url, headers=headers, timeout=10)
response.raise_for_status()
soup = BeautifulSoup(response.content, 'html.parser')
# Parse model cards from the page
model_cards = soup.find_all('article', class_='overview-card-wrapper')
if not model_cards:
logs.append(f" No model cards found on page {page}")
break
logs.append(f" Found {len(model_cards)} model cards on page {page}")
for card in model_cards:
if len(models) >= max_results:
break
try:
link = card.find('a', href=True)
if link:
href = link.get('href', '')
model_name = href.lstrip('/')
if model_name and model_name != '#' and model_name not in seen_models:
seen_models.add(model_name)
# Parse stats directly from the card HTML by looking at SVG icons
downloads = 0
likes = 0
size = ""
# Find all SVG elements in the card
svgs = card.find_all('svg')
for svg in svgs:
# Get the next sibling text after the SVG
# Could be direct text or text within a span/other element
next_elem = svg.find_next_sibling(string=True)
stat_text = ""
if next_elem and next_elem.strip():
stat_text = next_elem.strip()
else:
# Try to find text in the next sibling element (e.g., <span>)
next_tag = svg.find_next_sibling()
if next_tag:
stat_text = next_tag.get_text(strip=True)
if not stat_text or len(stat_text) < 1:
continue
# Identify icon type by viewBox or path content
svg_str = str(svg)
# Download icon: viewBox="0 0 32 32" with download arrow path
if 'M26 24v4H6v-4H4v4a2 2 0 0 0 2 2h20a2 2 0 0 0 2-2v-4zm0-10l-1.41-1.41L17 20.17V2h-2v18.17l-7.59-7.58L6 14l10 10l10-10z' in svg_str:
downloads = parse_stat_number(stat_text)
# Like/heart icon: heart path
elif 'M22.45,6a5.47,5.47,0,0,1,3.91,1.64,5.7,5.7,0,0,1,0,8L16,26.13' in svg_str:
likes = parse_stat_number(stat_text)
# Model size icon: small grid icon (viewBox="0 0 12 12") with specific path for parameter count
elif 'M10 10H8.4V8.4H10V10Zm0-3.2H8.4V5.2H10v1.6ZM6.8 10H5.2V8.4h1.6V10Z' in svg_str:
# Model parameter count (e.g., "2B", "0.6B")
# Must be short and contain B for billion params
if len(stat_text) <= 6 and re.search(r'\d+\.?\d*\s*[Bb]', stat_text):
size = stat_text
models.append({
'name': model_name,
'url': f"https://huggingface.co/{model_name}",
'downloads': downloads,
'likes': likes,
'size': size
})
except Exception as e:
logs.append(f" Error parsing model card: {e}")
continue
except Exception as e:
logs.append(f" ERROR searching page {page}: {e}")
break
# Sort by downloads (descending)
models.sort(key=lambda x: x['downloads'], reverse=True)
logs.append(f"Total unique models found: {len(models)}")
return models, logs
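# Illustrative usage (a sketch; the language code must exist in LANGUAGES, e.g. "swa" for Swahili):
#   models, logs = search_huggingface_models("swa", "text-to-speech", max_results=20, max_pages=2)
#   for m in models[:5]:
#       print(m["name"], m["downloads"], m["likes"])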
def get_huggingface_stats(item_name, item_type='datasets'):
"""
Get likes and downloads for a HuggingFace dataset or model using API
item_type: 'datasets' or 'models'
Returns dict with likes and downloads
NOTE: This method is currently NOT USED. We parse stats directly from HTML instead.
Keeping it here as a fallback in case HTML parsing fails.
"""
try:
api_url = f"https://huggingface.co/api/{item_type}/{item_name}"
response = requests.get(api_url, timeout=5)
if response.status_code == 200:
data = response.json()
return {
'likes': data.get('likes', 0),
'downloads': data.get('downloads', 0)
}
except Exception:
pass
return {'likes': 0, 'downloads': 0}
def parse_stat_number(stat_text):
"""
Parse HuggingFace stat numbers like '4.07M', '23.4k', '349' into integers
Returns integer value or 0 if parsing fails
"""
if not stat_text:
return 0
stat_text = stat_text.strip().upper()
try:
# Handle 'M' (millions)
if 'M' in stat_text:
return int(float(stat_text.replace('M', '')) * 1_000_000)
# Handle 'K' (thousands)
elif 'K' in stat_text:
return int(float(stat_text.replace('K', '')) * 1_000)
# Plain number
else:
return int(stat_text.replace(',', ''))
except (ValueError, AttributeError):
return 0
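# Illustrative conversions for parse_stat_number (inputs taken from the docstring examples):
#   parse_stat_number("4.07M")  -> 4070000
#   parse_stat_number("23.4k")  -> 23400
#   parse_stat_number("349")    -> 349
#   parse_stat_number("n/a")    -> 0  (unparseable text falls back to 0)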
def deduplicate_models(models):
"""
Deduplicate models by base name (without user/org prefix)
Keep the model with most downloads and count duplicates
Returns list of deduplicated models with duplicate count added
"""
from collections import defaultdict
# Group models by base name
grouped = defaultdict(list)
for model in models:
# Extract base name (everything after last '/')
name_parts = model['name'].split('/')
if len(name_parts) > 1:
base_name = name_parts[-1] # e.g., "whisper-large-v3"
else:
base_name = model['name']
grouped[base_name].append(model)
# For each group, keep the one with most downloads
deduplicated = []
for base_name, model_list in grouped.items():
# Sort by downloads (descending) and keep the first one
model_list.sort(key=lambda x: x['downloads'], reverse=True)
best_model = model_list[0]
# Record the total number of copies in the group (1 means unique, matching the default
# used when deduplication is disabled)
best_model['duplicates'] = len(model_list)
deduplicated.append(best_model)
# Sort by downloads again
deduplicated.sort(key=lambda x: x['downloads'], reverse=True)
return deduplicated
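# Illustrative example of deduplication (the second repo and both download counts are hypothetical):
# given openai/whisper-large-v3 with 5,000,000 downloads and someorg/whisper-large-v3 with 1,200
# downloads, both share the base name "whisper-large-v3", so only the openai/ entry is kept and its
# Duplicates value records that more than one copy was found.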
def search_huggingface_datasets(language_code, task_category, max_results=100, max_pages=3):
"""
Search HuggingFace for datasets supporting a specific language
task_category: 'automatic-speech-recognition' or 'text-to-speech'
max_results: maximum number of datasets to return
max_pages: maximum number of pages to search per language code
Returns tuple: (list of dataset dictionaries, log messages)
"""
lang_info = LANGUAGES.get(language_code)
logs = []
if not lang_info:
logs.append(f"No language info found for code: {language_code}")
return [], logs
# Collect all unique language codes for this language
language_codes = set()
if lang_info['alpha2']:
language_codes.add(lang_info['alpha2']) # 2-letter code
if lang_info['alpha3_b']:
language_codes.add(lang_info['alpha3_b']) # 3-letter code
if lang_info['alpha3_t']:
language_codes.add(lang_info['alpha3_t']) # 3-letter terminological
logs.append(f"Language codes to search: {language_codes}")
datasets = []
seen_datasets = set()
# Search separately for each language code
for code in language_codes:
if len(datasets) >= max_results:
break
logs.append(f"Searching for language code: {code}")
for page in range(max_pages):
if len(datasets) >= max_results:
break
try:
# Use HuggingFace dataset search with correct format
# Format: task_categories=task_categories:automatic-speech-recognition&language=language:en
url = f"https://huggingface.co/datasets?task_categories=task_categories:{task_category}&language=language:{code}&sort=trending"
if page > 0:
url += f"&p={page}"
logs.append(f" Page {page}: {url}")
headers = {
'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36'
}
response = requests.get(url, headers=headers, timeout=10)
response.raise_for_status()
soup = BeautifulSoup(response.content, 'html.parser')
# Parse dataset cards from the page
dataset_cards = soup.find_all('article', class_='overview-card-wrapper')
if not dataset_cards:
logs.append(f" No dataset cards found on page {page}")
break
logs.append(f" Found {len(dataset_cards)} dataset cards on page {page}")
for card in dataset_cards:
if len(datasets) >= max_results:
break
try:
link = card.find('a', href=True)
if link:
href = link.get('href', '')
dataset_path = href.lstrip('/')
# Remove "datasets/" prefix if present
if dataset_path.startswith('datasets/'):
dataset_name = dataset_path[9:] # Remove "datasets/" (9 chars)
else:
dataset_name = dataset_path
if dataset_name and dataset_name != '#' and dataset_name not in seen_datasets:
seen_datasets.add(dataset_name)
# Parse stats directly from the card HTML by looking at SVG icons
downloads = 0
likes = 0
size = ""
# Find all SVG elements in the card
svgs = card.find_all('svg')
for svg in svgs:
# Get the next sibling text after the SVG
# Could be direct text or text within a span/other element
next_elem = svg.find_next_sibling(string=True)
stat_text = ""
if next_elem and next_elem.strip():
stat_text = next_elem.strip()
else:
# Try to find text in the next sibling element (e.g., <span>)
next_tag = svg.find_next_sibling()
if next_tag:
stat_text = next_tag.get_text(strip=True)
# Skip non-numeric text like "Viewer", "Updated", etc.
if not stat_text or len(stat_text) < 1 or stat_text in ['Viewer', 'Updated']:
continue
# Identify icon type by viewBox or path content
svg_str = str(svg)
# Download icon: viewBox="0 0 32 32" with download arrow path
if 'M26 24v4H6v-4H4v4a2 2 0 0 0 2 2h20a2 2 0 0 0 2-2v-4zm0-10l-1.41-1.41L17 20.17V2h-2v18.17l-7.59-7.58L6 14l10 10l10-10z' in svg_str:
downloads = parse_stat_number(stat_text)
# Like/heart icon: heart path
elif 'M22.45,6a5.47,5.47,0,0,1,3.91,1.64,5.7,5.7,0,0,1,0,8L16,26.13' in svg_str:
likes = parse_stat_number(stat_text)
# Dataset size icon: table/grid icon with fill-rule="evenodd"
elif 'fill-rule="evenodd"' in svg_str and 'clip-rule="evenodd"' in svg_str:
# Dataset size (e.g., "411k", "23.4M", "65.1k")
# Must look like a number (has k, M, or digits)
if any(c in stat_text for c in ['k', 'K', 'm', 'M']) or stat_text.replace(',', '').replace('.', '').isdigit():
size = stat_text
datasets.append({
'name': dataset_name,
'url': f"https://huggingface.co/datasets/{dataset_name}",
'downloads': downloads,
'likes': likes,
'size': size
})
except Exception as e:
logs.append(f" Error parsing dataset card: {e}")
continue
except Exception as e:
logs.append(f" ERROR searching page {page}: {e}")
break
# Sort by downloads (descending)
datasets.sort(key=lambda x: x['downloads'], reverse=True)
logs.append(f"Total unique datasets found: {len(datasets)}")
return datasets, logs
def search_language_resources(language_code, deduplicate=False):
"""
Search for ASR/TTS resources for a given language
Returns results organized by service type
deduplicate: if True, remove duplicate models (same base name) and keep only the one with most downloads
"""
all_logs = []
if not language_code:
return None, None, None, 0, 0, None, None, 0, 0, ""
lang_info = LANGUAGES.get(language_code)
if not lang_info:
return None, None, None, 0, 0, None, None, 0, 0, ""
language_name = lang_info['name']
all_logs.append(f"=== Searching for {language_name} ({language_code}) ===")
all_logs.append(f"Language codes: alpha2={lang_info['alpha2']}, alpha3_b={lang_info['alpha3_b']}, alpha3_t={lang_info['alpha3_t']}")
# Fetch Azure data
all_logs.append("\n[Azure Speech Services]")
azure_asr = fetch_azure_asr_languages()
azure_tts = fetch_azure_tts_languages()
all_logs.append(f" Fetched {len(azure_asr)} ASR languages and {len(azure_tts)} TTS languages from Azure")
# Get matching Azure locales using alpha2 code
azure_locales = get_azure_locales_for_language(language_code)
all_logs.append(f" Matching Azure locales: {azure_locales}")
# Check Azure ASR support
azure_asr_locales = [loc for loc in azure_locales if loc in azure_asr]
azure_asr_available = len(azure_asr_locales) > 0
all_logs.append(f" Azure ASR: {'✅ Supported' if azure_asr_available else '❌ Not supported'} ({len(azure_asr_locales)} locales)")
# Check Azure TTS support and count voices
azure_tts_locales = [loc for loc in azure_locales if loc in azure_tts]
azure_tts_available = len(azure_tts_locales) > 0
azure_total_voices = sum(azure_tts[loc]['voice_count'] for loc in azure_tts_locales)
all_logs.append(f" Azure TTS: {'✅ Supported' if azure_tts_available else '❌ Not supported'} ({len(azure_tts_locales)} locales, {azure_total_voices} voices)")
# Fetch Google Cloud data
all_logs.append("\n[Google Cloud Speech]")
google_stt = fetch_google_stt_languages()
google_tts = fetch_google_tts_languages()
all_logs.append(f" Fetched {len(google_stt)} STT languages and {len(google_tts)} TTS languages from Google Cloud")
# Get matching Google Cloud locales using alpha2 code
google_locales = get_google_locales_for_language(language_code)
all_logs.append(f" Matching Google Cloud locales: {google_locales}")
# Check Google Cloud STT support
google_stt_locales = [loc for loc in google_locales if loc in google_stt]
google_stt_available = len(google_stt_locales) > 0
all_logs.append(f" Google STT: {'✅ Supported' if google_stt_available else '❌ Not supported'} ({len(google_stt_locales)} locales)")
# Check Google Cloud TTS support and count voices
google_tts_locales = [loc for loc in google_locales if loc in google_tts]
google_tts_available = len(google_tts_locales) > 0
google_total_voices = sum(google_tts[loc]['voice_count'] for loc in google_tts_locales)
all_logs.append(f" Google TTS: {'✅ Supported' if google_tts_available else '❌ Not supported'} ({len(google_tts_locales)} locales, {google_total_voices} voices)")
# Fetch AWS data
all_logs.append("\n[AWS (Transcribe + Polly)]")
aws_transcribe = fetch_aws_transcribe_languages()
aws_polly = fetch_aws_polly_languages()
all_logs.append(f" Fetched {len(aws_transcribe)} Transcribe languages and {len(aws_polly)} Polly languages from AWS")
# Get matching AWS locales using alpha2 code
aws_locales = get_aws_locales_for_language(language_code)
all_logs.append(f" Matching AWS locales: {aws_locales}")
# Check AWS Transcribe support
aws_transcribe_locales = [loc for loc in aws_locales if loc in aws_transcribe]
aws_transcribe_available = len(aws_transcribe_locales) > 0
all_logs.append(f" AWS Transcribe: {'✅ Supported' if aws_transcribe_available else '❌ Not supported'} ({len(aws_transcribe_locales)} locales)")
# Check AWS Polly support and count voices
aws_polly_locales = [loc for loc in aws_locales if loc in aws_polly]
aws_polly_available = len(aws_polly_locales) > 0
aws_total_voices = sum(aws_polly[loc]['voice_count'] for loc in aws_polly_locales)
all_logs.append(f" AWS Polly: {'✅ Supported' if aws_polly_available else '❌ Not supported'} ({len(aws_polly_locales)} locales, {aws_total_voices} voices)")
# Commercial Services
commercial_rows = []
# Azure Speech
if azure_asr_available:
azure_asr_text = f"✅ {len(azure_asr_locales)} locale(s)"
else:
azure_asr_text = "❌ N/A"
if azure_tts_available:
azure_tts_text = f"✅ {len(azure_tts_locales)} locale(s), {azure_total_voices} voice(s)"
else:
azure_tts_text = "❌ N/A"
commercial_rows.append({
"Service": "Azure Speech",
"ASR": azure_asr_text,
"TTS": azure_tts_text,
})
# Google Cloud Speech
if google_stt_available:
google_stt_text = f"✅ {len(google_stt_locales)} locale(s)"
else:
google_stt_text = "❌ N/A"
if google_tts_available:
google_tts_text = f"✅ {len(google_tts_locales)} locale(s), {google_total_voices} voice(s)"
else:
google_tts_text = "❌ N/A"
commercial_rows.append({
"Service": "Google Cloud Speech",
"ASR": google_stt_text,
"TTS": google_tts_text,
})
# AWS (Transcribe + Polly)
if aws_transcribe_available:
aws_transcribe_text = f"✅ {len(aws_transcribe_locales)} locale(s)"
else:
aws_transcribe_text = "❌ N/A"
if aws_polly_available:
aws_polly_text = f"✅ {len(aws_polly_locales)} locale(s), {aws_total_voices} voice(s)"
else:
aws_polly_text = "❌ N/A"
commercial_rows.append({
"Service": "AWS (Transcribe + Polly)",
"ASR": aws_transcribe_text,
"TTS": aws_polly_text,
})
# ElevenLabs Multilingual v2 (TTS only)
all_logs.append("\n[ElevenLabs]")
elevenlabs_v2_supported = check_elevenlabs_multilingual_v2_support(language_code)
all_logs.append(f" Multilingual v2: {'✅ Supported' if elevenlabs_v2_supported else '❌ Not supported'}")
if elevenlabs_v2_supported:
elevenlabs_v2_tts_text = "✅ Supported"
else:
elevenlabs_v2_tts_text = "❌ N/A"
commercial_rows.append({
"Service": "ElevenLabs Multilingual v2",
"ASR": "N/A", # ElevenLabs doesn't offer ASR
"TTS": elevenlabs_v2_tts_text,
})
# ElevenLabs Turbo v3 (TTS only)
elevenlabs_v3_supported = check_elevenlabs_turbo_v3_support(language_code)
all_logs.append(f" Turbo v3: {'✅ Supported' if elevenlabs_v3_supported else '❌ Not supported'}")
if elevenlabs_v3_supported:
elevenlabs_v3_tts_text = "✅ Supported"
else:
elevenlabs_v3_tts_text = "❌ N/A"
commercial_rows.append({
"Service": "ElevenLabs Turbo v3",
"ASR": "N/A", # ElevenLabs doesn't offer ASR
"TTS": elevenlabs_v3_tts_text,
})
commercial_df = pd.DataFrame(commercial_rows)
# HuggingFace Models - Search for real ASR and TTS models
all_logs.append("\n[HuggingFace Models]")
asr_models, asr_model_logs = search_huggingface_models(language_code, 'automatic-speech-recognition', max_results=100, max_pages=5)
all_logs.extend([f" [ASR] {log}" for log in asr_model_logs])
tts_models, tts_model_logs = search_huggingface_models(language_code, 'text-to-speech', max_results=100, max_pages=5)
all_logs.extend([f" [TTS] {log}" for log in tts_model_logs])
# Apply deduplication if requested
if deduplicate:
all_logs.append(f"\n[Deduplication]")
asr_before = len(asr_models)
asr_models = deduplicate_models(asr_models)
all_logs.append(f" ASR models: {asr_before}{len(asr_models)} (removed {asr_before - len(asr_models)} duplicates)")
tts_before = len(tts_models)
tts_models = deduplicate_models(tts_models)
all_logs.append(f" TTS models: {tts_before}{len(tts_models)} (removed {tts_before - len(tts_models)} duplicates)")
else:
# Add duplicates count of 1 for all models when not deduplicating
for model in asr_models:
model['duplicates'] = 1
for model in tts_models:
model['duplicates'] = 1
# Format ASR models with clickable names
asr_models_data = []
for model in asr_models:
asr_models_data.append({
"Model Name": f"[{model['name']}]({model['url']})",
"Downloads": model['downloads'],
"Likes": model['likes'],
"Size": model.get('size', ''),
"Duplicates": model.get('duplicates', 1)
})
if asr_models_data:
asr_models_df = pd.DataFrame(asr_models_data)
else:
# Empty dataframe if no models found
asr_models_df = pd.DataFrame(columns=["Model Name", "Downloads", "Likes", "Size", "Duplicates"])
# Format TTS models with clickable names
tts_models_data = []
for model in tts_models:
tts_models_data.append({
"Model Name": f"[{model['name']}]({model['url']})",
"Downloads": model['downloads'],
"Likes": model['likes'],
"Size": model.get('size', ''),
"Duplicates": model.get('duplicates', 1)
})
if tts_models_data:
tts_models_df = pd.DataFrame(tts_models_data)
else:
# Empty dataframe if no models found
tts_models_df = pd.DataFrame(columns=["Model Name", "Downloads", "Likes", "Size", "Duplicates"])
# HuggingFace Datasets - Search for real ASR and TTS datasets
all_logs.append("\n[HuggingFace Datasets]")
asr_datasets, asr_dataset_logs = search_huggingface_datasets(language_code, 'automatic-speech-recognition', max_results=100, max_pages=5)
all_logs.extend([f" [ASR] {log}" for log in asr_dataset_logs])
tts_datasets, tts_dataset_logs = search_huggingface_datasets(language_code, 'text-to-speech', max_results=100, max_pages=5)
all_logs.extend([f" [TTS] {log}" for log in tts_dataset_logs])
# Format ASR datasets with clickable names
asr_datasets_data = []
for dataset in asr_datasets:
asr_datasets_data.append({
"Dataset Name": f"[{dataset['name']}]({dataset['url']})",
"Downloads": dataset['downloads'],
"Likes": dataset['likes'],
"Size": dataset.get('size', '')
})
if asr_datasets_data:
asr_datasets_df = pd.DataFrame(asr_datasets_data)
else:
# Empty dataframe if no datasets found
asr_datasets_df = pd.DataFrame(columns=["Dataset Name", "Downloads", "Likes", "Size"])
# Format TTS datasets with clickable names
tts_datasets_data = []
for dataset in tts_datasets:
tts_datasets_data.append({
"Dataset Name": f"[{dataset['name']}]({dataset['url']})",
"Downloads": dataset['downloads'],
"Likes": dataset['likes'],
"Size": dataset.get('size', '')
})
if tts_datasets_data:
tts_datasets_df = pd.DataFrame(tts_datasets_data)
else:
# Empty dataframe if no datasets found
tts_datasets_df = pd.DataFrame(columns=["Dataset Name", "Downloads", "Likes", "Size"])
# Combine all logs
log_text = "\n".join(all_logs)
# Return separate ASR and TTS dataframes, plus counts for tab labels, plus logs
return commercial_df, asr_models_df, tts_models_df, len(asr_models), len(tts_models), asr_datasets_df, tts_datasets_df, len(asr_datasets), len(tts_datasets), log_text
# Initialize - load language list and app content
print("Initializing Speech Resource Finder...")
load_app_content()
load_language_list()
load_language_taxonomy()
# Create language choices for dropdown (code: name format for easy searching)
language_choices = [f"{code}: {info['name']}" for code, info in sorted(LANGUAGES.items(), key=lambda x: x[1]['name'])]
print(f"Created dropdown with {len(language_choices)} language options")
with gr.Blocks(title=APP_CONTENT["title"]) as demo:
gr.Markdown(f"# {APP_CONTENT['title']}")
gr.Markdown(APP_CONTENT["description"])
with gr.Row(equal_height=True):
with gr.Column(scale=2):
language_dropdown = gr.Dropdown(
choices=language_choices,
label="Select Language",
info="Type to search for a language",
allow_custom_value=False,
filterable=True,
)
with gr.Column(scale=1):
language_metadata = gr.HTML(
"""<div style='padding: 15px; border: 2px solid #e0e0e0; border-radius: 4px; background-color: #fafafa; height: 100%; display: flex; align-items: center; justify-content: center; box-sizing: border-box;'>
<p style='margin: 0; color: #333; font-size: 14px;'>Select a language to see resource classification</p>
</div>""",
elem_id="language-metadata"
)
gr.Markdown("## Commercial Services")
commercial_table = gr.Dataframe(
headers=["Service", "ASR", "TTS"],
interactive=False,
wrap=True,
)
gr.Markdown("## HuggingFace Models")
with gr.Row():
deduplicate_checkbox = gr.Checkbox(
label="Deduplicate models",
value=True,
info="Keep only the model with most downloads for each base name"
)
# Create tabs for ASR and TTS models with count labels
with gr.Tabs():
with gr.Tab(label="ASR Models") as asr_tab:
asr_count_label = gr.Markdown("*Loading...*")
asr_models_table = gr.Dataframe(
headers=["Model Name", "Downloads", "Likes", "Size", "Duplicates"],
interactive=False,
wrap=True,
datatype=["markdown", "number", "number", "str", "number"],
)
with gr.Tab(label="TTS Models") as tts_tab:
tts_count_label = gr.Markdown("*Loading...*")
tts_models_table = gr.Dataframe(
headers=["Model Name", "Downloads", "Likes", "Size", "Duplicates"],
interactive=False,
wrap=True,
datatype=["markdown", "number", "number", "str", "number"],
)
gr.Markdown("## HuggingFace Datasets")
# Create tabs for ASR and TTS datasets with count labels
with gr.Tabs():
with gr.Tab(label="ASR Datasets") as asr_datasets_tab:
asr_datasets_count_label = gr.Markdown("*Loading...*")
asr_datasets_table = gr.Dataframe(
headers=["Dataset Name", "Downloads", "Likes", "Size"],
interactive=False,
wrap=True,
datatype=["markdown", "number", "number", "str"],
)
with gr.Tab(label="TTS Datasets") as tts_datasets_tab:
tts_datasets_count_label = gr.Markdown("*Loading...*")
tts_datasets_table = gr.Dataframe(
headers=["Dataset Name", "Downloads", "Likes", "Size"],
interactive=False,
wrap=True,
datatype=["markdown", "number", "number", "str"],
)
with gr.Accordion("Logs", open=False):
log_textbox = gr.Textbox(
show_label=False,
lines=15,
max_lines=30,
interactive=False,
placeholder="Logs will appear here...",
autoscroll=True,
)
# About section with full content
with gr.Accordion("About this tool", open=False):
gr.Markdown(APP_CONTENT["full_content"])
def on_search(language_selection, deduplicate):
if not language_selection:
default_html = """<div style='padding: 15px; border: 2px solid #e0e0e0; border-radius: 4px; background-color: #fafafa; height: 100%; display: flex; align-items: center; justify-content: center; box-sizing: border-box;'>
<p style='margin: 0; color: #333; font-size: 14px;'>Select a language to see resource classification</p>
</div>"""
return default_html, None, "", None, "", None, "", None, "", None, ""
# Extract the language code from "code: name" format
language_code = language_selection.split(":")[0].strip()
# Get language name for taxonomy lookup
language_name = LANGUAGES.get(language_code, {}).get("name", "")
# Get taxonomy classification
level, classification = get_language_taxonomy_info(language_name)
# Create metadata display with color coding
if level is not None:
color = get_taxonomy_color(level)
metadata_html = f"""<div style='padding: 15px; border: 2px solid {color}; border-radius: 4px; background-color: #ffffff; height: 100%; box-sizing: border-box; display: flex; flex-direction: column; justify-content: center;'>
<h4 style='margin: 0 0 8px 0; color: #333; font-size: 16px;'>{language_name}</h4>
<div style='margin: 8px 0;'>
<span style='padding: 6px 12px; background-color: {color}; color: white; border-radius: 4px; font-weight: bold; font-size: 13px;'>{classification}</span>
</div>
<p style='margin: 8px 0 0 0; font-size: 11px; color: #555;'>Source: <a href='https://microsoft.github.io/linguisticdiversity/' target='_blank' style='color: #1976d2; text-decoration: none;'>Joshi et al.</a></p>
</div>"""
else:
metadata_html = f"""<div style='padding: 15px; border: 2px solid #757575; border-radius: 4px; background-color: #ffffff; height: 100%; box-sizing: border-box; display: flex; flex-direction: column; justify-content: center;'>
<h4 style='margin: 0 0 8px 0; color: #333; font-size: 16px;'>{language_name}</h4>
<div style='margin: 8px 0;'>
<span style='padding: 6px 12px; background-color: #757575; color: white; border-radius: 4px; font-weight: bold; font-size: 13px;'>Unknown</span>
</div>
<p style='margin: 8px 0 0 0; font-size: 11px; color: #555;'>Source: <a href='https://microsoft.github.io/linguisticdiversity/' target='_blank' style='color: #1976d2; text-decoration: none;'>Joshi et al.</a></p>
</div>"""
commercial_df, asr_models_df, tts_models_df, asr_models_count, tts_models_count, asr_datasets_df, tts_datasets_df, asr_datasets_count, tts_datasets_count, logs = search_language_resources(language_code, deduplicate=deduplicate)
# Create count labels
asr_models_label = f"**Found {asr_models_count} ASR model(s)**"
tts_models_label = f"**Found {tts_models_count} TTS model(s)**"
asr_datasets_label = f"**Found {asr_datasets_count} ASR dataset(s)**"
tts_datasets_label = f"**Found {tts_datasets_count} TTS dataset(s)**"
return metadata_html, commercial_df, asr_models_label, asr_models_df, tts_models_label, tts_models_df, asr_datasets_label, asr_datasets_df, tts_datasets_label, tts_datasets_df, logs
# Trigger search when language is selected
language_dropdown.change(
fn=on_search,
inputs=[language_dropdown, deduplicate_checkbox],
outputs=[language_metadata, commercial_table, asr_count_label, asr_models_table, tts_count_label, tts_models_table, asr_datasets_count_label, asr_datasets_table, tts_datasets_count_label, tts_datasets_table, log_textbox],
)
# Trigger search when deduplicate checkbox is changed
deduplicate_checkbox.change(
fn=on_search,
inputs=[language_dropdown, deduplicate_checkbox],
outputs=[language_metadata, commercial_table, asr_count_label, asr_models_table, tts_count_label, tts_models_table, asr_datasets_count_label, asr_datasets_table, tts_datasets_count_label, tts_datasets_table, log_textbox],
)
if __name__ == "__main__":
demo.launch(server_name="0.0.0.0", server_port=7860, share=False, show_error=True)