import gradio as gr
import pandas as pd
import requests
from bs4 import BeautifulSoup
from functools import lru_cache
import csv
from io import StringIO
import re


# Data files and external sources
LANGUAGE_CODES_FILE = "language-codes-full.csv"
APP_CONTENT_FILE = "app_content.md"
LANGUAGE_TAXONOMY_URL = "https://microsoft.github.io/linguisticdiversity/assets/lang2tax.txt"

# Language registry: primary ISO 639 code -> {name, alpha3_b, alpha3_t, alpha2}
LANGUAGES = {}

# Taxonomy map: lowercase language name -> resource level (0-5)
LANGUAGE_TAXONOMY = {}

# Resource-level labels from Joshi et al.'s linguistic diversity taxonomy
TAXONOMY_LEVELS = {
    0: "The Left-Behinds",
    1: "The Scraping-Bys",
    2: "The Hopefuls",
    3: "The Rising Stars",
    4: "The Underdogs",
    5: "The Winners"
}

# Default app content, overridden by load_app_content()
APP_CONTENT = {
    "title": "Speech Resource Finder",
    "description": "Search for speech resources",
    "full_content": ""
}


def load_app_content(content_path=None):
    """Load app content from a markdown file"""
    global APP_CONTENT
    if content_path is None:
        content_path = APP_CONTENT_FILE

    try:
        with open(content_path, 'r', encoding='utf-8') as f:
            content = f.read()

        lines = content.split('\n')

        # Title: first level-1 heading in the file
        title = "Speech Resource Finder"
        for line in lines:
            if line.startswith('# '):
                title = line[2:].strip()
                break

        # Description: text of the "## Description" section
        description = ""
        in_description = False
        for line in lines:
            if line.startswith('## Description'):
                in_description = True
                continue
            elif in_description and line.startswith('##'):
                break
            elif in_description and line.strip():
                description += line.strip() + " "

        APP_CONTENT = {
            "title": title,
            "description": description.strip(),
            "full_content": content
        }
        print(f"Loaded app content from {content_path}")
    except Exception as e:
        print(f"Error loading app content: {e}")
        print("Using default content")


def load_language_list(csv_path=None):
    """Load ISO 639 language codes from a CSV file"""
    global LANGUAGES
    if csv_path is None:
        csv_path = LANGUAGE_CODES_FILE

    try:
        with open(csv_path, 'r', encoding='utf-8') as f:
            reader = csv.DictReader(f)
            for row in reader:
                code_b = row['alpha3-b'].strip()
                code_t = row['alpha3-t'].strip()
                code_2 = row['alpha2'].strip()
                name = row['English'].strip()

                # Prefer the bibliographic alpha3 code as the primary key
                primary_code = code_b if code_b else code_t

                if primary_code and name:
                    LANGUAGES[primary_code] = {
                        "name": name,
                        "alpha3_b": code_b,
                        "alpha3_t": code_t,
                        "alpha2": code_2
                    }
        print(f"Loaded {len(LANGUAGES)} languages from {csv_path}")
    except Exception as e:
        print(f"Error loading language list: {e}")

        # Minimal built-in fallback so the app still works without the CSV
        LANGUAGES = {
            "eng": {"name": "English", "alpha3_b": "eng", "alpha3_t": "", "alpha2": "en"},
            "spa": {"name": "Spanish", "alpha3_b": "spa", "alpha3_t": "", "alpha2": "es"},
            "fra": {"name": "French", "alpha3_b": "fra", "alpha3_t": "", "alpha2": "fr"},
            "deu": {"name": "German", "alpha3_b": "ger", "alpha3_t": "deu", "alpha2": "de"},
        }
        print(f"Using fallback with {len(LANGUAGES)} languages")


def load_language_taxonomy():
    """Load language taxonomy data from Microsoft's linguistic diversity project"""
    global LANGUAGE_TAXONOMY

    try:
        response = requests.get(LANGUAGE_TAXONOMY_URL, timeout=10)
        response.raise_for_status()

        # Each line maps a language name to its taxonomy level: "<name>,<level>"
        for line in response.text.strip().split('\n'):
            if line.strip():
                parts = line.strip().split(',')
                if len(parts) == 2:
                    lang_name = parts[0].strip().lower()
                    level = int(parts[1].strip())
                    LANGUAGE_TAXONOMY[lang_name] = level

        print(f"Loaded taxonomy data for {len(LANGUAGE_TAXONOMY)} languages")
    except Exception as e:
        print(f"Warning: Could not load language taxonomy: {e}")
        print("Language classification will show as 'Unknown'")


def get_taxonomy_color(level):
    """
    Get color code for taxonomy level (red for left-behind, green for winners)
    """
    colors = {
        0: "#d32f2f",
        1: "#f57c00",
        2: "#fbc02d",
        3: "#afb42b",
        4: "#7cb342",
        5: "#388e3c",
    }
    return colors.get(level, "#757575")


def get_language_taxonomy_info(language_name):
    """
    Get taxonomy classification for a language.
    Returns a tuple of (level, description) or (None, "Unknown")
    """
    if not language_name:
        return None, "Unknown"

    # Try an exact (case-insensitive) match on the language name
    lang_lower = language_name.lower()
    if lang_lower in LANGUAGE_TAXONOMY:
        level = LANGUAGE_TAXONOMY[lang_lower]
        return level, TAXONOMY_LEVELS.get(level, f"Level {level}")

    # Some ISO names list alternatives separated by ';' (e.g. "Spanish; Castilian")
    if ';' in lang_lower:
        parts = [p.strip() for p in lang_lower.split(';')]
        for part in parts:
            if part in LANGUAGE_TAXONOMY:
                level = LANGUAGE_TAXONOMY[part]
                return level, TAXONOMY_LEVELS.get(level, f"Level {level}")

    # Others use ',' as the separator
    if ',' in lang_lower:
        parts = [p.strip() for p in lang_lower.split(',')]
        for part in parts:
            if part in LANGUAGE_TAXONOMY:
                level = LANGUAGE_TAXONOMY[part]
                return level, TAXONOMY_LEVELS.get(level, f"Level {level}")

    return None, "Unknown"
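
# Illustrative lookups for the helper above (levels are hypothetical examples;
# the real values come from lang2tax.txt at runtime):
#   get_language_taxonomy_info("English")            -> (5, "The Winners")
#   get_language_taxonomy_info("Spanish; Castilian") -> matched via the "spanish" part
#   get_language_taxonomy_info("Klingon")            -> (None, "Unknown")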


@lru_cache(maxsize=1)
def fetch_azure_asr_languages():
    """Scrape Azure Speech-to-Text supported languages"""
    url = "https://learn.microsoft.com/en-us/azure/ai-services/speech-service/language-support?tabs=stt"

    try:
        response = requests.get(url, timeout=10)
        response.raise_for_status()
        soup = BeautifulSoup(response.content, 'html.parser')

        # Find the table whose header mentions Locale/Language
        tables = soup.find_all('table')

        azure_asr = {}
        for table in tables:
            rows = table.find_all('tr')
            if not rows:
                continue

            headers = [th.get_text(strip=True) for th in rows[0].find_all('th')]
            if 'Locale' in ' '.join(headers) or 'Language' in ' '.join(headers):
                for row in rows[1:]:
                    cols = row.find_all('td')
                    if len(cols) >= 2:
                        locale = cols[0].get_text(strip=True)
                        language = cols[1].get_text(strip=True)
                        if locale and language:
                            azure_asr[locale] = language
                break

        return azure_asr
    except Exception as e:
        print(f"Error fetching Azure ASR data: {e}")
        return {}


@lru_cache(maxsize=1)
def fetch_azure_tts_languages():
    """Scrape Azure Text-to-Speech supported languages with voice counts"""
    url = "https://learn.microsoft.com/en-us/azure/ai-services/speech-service/language-support?tabs=tts"

    try:
        response = requests.get(url, timeout=10)
        response.raise_for_status()
        soup = BeautifulSoup(response.content, 'html.parser')

        # Find the table whose header mentions Text to speech / voices
        tables = soup.find_all('table')

        azure_tts = {}
        for table in tables:
            rows = table.find_all('tr')
            if not rows:
                continue

            headers = [th.get_text(strip=True) for th in rows[0].find_all('th')]
            if 'Text to speech' in ' '.join(headers) or 'voices' in ' '.join(headers).lower():
                for row in rows[1:]:
                    cols = row.find_all('td')
                    if len(cols) >= 3:
                        locale = cols[0].get_text(strip=True)
                        language = cols[1].get_text(strip=True)
                        voices_text = cols[2].get_text(strip=True)
                        # Each neural voice name contains the word "Neural"
                        voice_count = voices_text.count('Neural')
                        if locale and language:
                            azure_tts[locale] = {
                                'language': language,
                                'voice_count': voice_count
                            }
                break

        return azure_tts
    except Exception as e:
        print(f"Error fetching Azure TTS data: {e}")
        return {}


@lru_cache(maxsize=1)
def fetch_google_stt_languages():
    """Scrape Google Cloud Speech-to-Text supported languages"""
    url = "https://cloud.google.com/speech-to-text/docs/speech-to-text-supported-languages"

    try:
        response = requests.get(url, timeout=10)
        response.raise_for_status()
        soup = BeautifulSoup(response.content, 'html.parser')

        tables = soup.find_all('table')

        google_stt = {}
        for table in tables:
            rows = table.find_all('tr')
            if not rows:
                continue

            headers = [th.get_text(strip=True) for th in rows[0].find_all('th')]

            # Locate the BCP-47 code column and the language name column
            bcp47_idx = None
            name_idx = None
            for idx, header in enumerate(headers):
                if 'BCP-47' in header or 'BCP47' in header:
                    bcp47_idx = idx
                if 'Name' in header and name_idx is None:
                    name_idx = idx

            if bcp47_idx is not None:
                for row in rows[1:]:
                    cols = row.find_all('td')
                    if len(cols) > bcp47_idx:
                        locale = cols[bcp47_idx].get_text(strip=True)
                        # Use "is not None" so a name column at index 0 is not skipped
                        language = cols[name_idx].get_text(strip=True) if name_idx is not None and len(cols) > name_idx else ''
                        if locale and locale not in ['—', '-', '']:
                            google_stt[locale] = language

        return google_stt
    except Exception as e:
        print(f"Error fetching Google STT data: {e}")
        return {}


@lru_cache(maxsize=1)
def fetch_google_tts_languages():
    """Scrape Google Cloud Text-to-Speech supported languages with voice counts"""
    url = "https://cloud.google.com/text-to-speech/docs/voices"

    try:
        response = requests.get(url, timeout=10)
        response.raise_for_status()
        soup = BeautifulSoup(response.content, 'html.parser')

        # The voices page lists one voice per row; count rows per language code
        tables = soup.find_all('table')

        google_tts = {}
        for table in tables:
            rows = table.find_all('tr')
            if not rows:
                continue

            headers = [th.get_text(strip=True) for th in rows[0].find_all('th')]

            # Locate the language code column
            lang_code_idx = None
            for idx, header in enumerate(headers):
                if 'Language code' in header or 'language code' in header.lower():
                    lang_code_idx = idx
                    break

            if lang_code_idx is not None:
                for row in rows[1:]:
                    cols = row.find_all('td')
                    if len(cols) > lang_code_idx:
                        locale = cols[lang_code_idx].get_text(strip=True)
                        if locale and locale not in ['—', '-', '']:
                            # One row per voice: increment the count for known locales
                            if locale in google_tts:
                                google_tts[locale]['voice_count'] += 1
                            else:
                                language = cols[0].get_text(strip=True) if len(cols) > 0 else ''
                                google_tts[locale] = {
                                    'language': language,
                                    'voice_count': 1
                                }

        return google_tts
    except Exception as e:
        print(f"Error fetching Google TTS data: {e}")
        return {}


@lru_cache(maxsize=1)
def fetch_elevenlabs_multilingual_v2():
    """Get ElevenLabs Multilingual v2 supported languages"""
    # Hard-coded set of supported language codes (two-letter style, plus 'fil')
    supported_codes = {
        'en', 'ja', 'zh', 'de', 'hi', 'fr', 'ko', 'pt', 'it', 'es',
        'id', 'nl', 'tr', 'fil', 'pl', 'sv', 'bg', 'ro', 'ar', 'cs',
        'el', 'fi', 'hr', 'ms', 'sk', 'da', 'ta', 'uk', 'ru'
    }
    return supported_codes


@lru_cache(maxsize=1)
def fetch_elevenlabs_turbo_v3():
    """Get ElevenLabs Eleven Turbo v3 (formerly v3 Alpha) supported languages"""
    # Hard-coded set of supported language codes (ISO 639-3 style)
    supported_codes = {
        'afr', 'ara', 'hye', 'asm', 'aze', 'bel', 'ben', 'bos', 'bul', 'cat',
        'ceb', 'nya', 'hrv', 'ces', 'dan', 'nld', 'eng', 'est', 'fil', 'fin',
        'fra', 'glg', 'kat', 'deu', 'ell', 'guj', 'hau', 'heb', 'hin', 'hun',
        'isl', 'ind', 'gle', 'ita', 'jpn', 'jav', 'kan', 'kaz', 'kir', 'kor',
        'lav', 'lin', 'lit', 'ltz', 'mkd', 'msa', 'mal', 'cmn', 'mar', 'nep',
        'nor', 'pus', 'fas', 'pol', 'por', 'pan', 'ron', 'rus', 'srp', 'snd',
        'slk', 'slv', 'som', 'spa', 'swa', 'swe', 'tam', 'tel', 'tha', 'tur',
        'ukr', 'urd', 'vie', 'cym'
    }
    return supported_codes


@lru_cache(maxsize=1)
def fetch_aws_transcribe_languages():
    """Scrape AWS Transcribe (ASR) supported languages"""
    url = "https://docs.aws.amazon.com/transcribe/latest/dg/supported-languages.html"

    try:
        response = requests.get(url, timeout=10)
        response.raise_for_status()
        soup = BeautifulSoup(response.content, 'html.parser')

        tables = soup.find_all('table')

        aws_transcribe = {}
        for table in tables:
            rows = table.find_all('tr')
            if not rows:
                continue

            headers = [th.get_text(strip=True) for th in rows[0].find_all('th')]

            # Locate the language code and language name columns
            lang_code_idx = None
            lang_name_idx = None
            for idx, header in enumerate(headers):
                if 'Language code' in header or 'language code' in header.lower():
                    lang_code_idx = idx
                if 'Language' == header or header.startswith('Language'):
                    lang_name_idx = idx

            if lang_code_idx is not None:
                for row in rows[1:]:
                    cols = row.find_all('td')
                    if len(cols) > lang_code_idx:
                        locale = cols[lang_code_idx].get_text(strip=True)
                        # Use "is not None" so a name column at index 0 is not skipped
                        language = cols[lang_name_idx].get_text(strip=True) if lang_name_idx is not None and len(cols) > lang_name_idx else ''
                        if locale and locale not in ['—', '-', '']:
                            aws_transcribe[locale] = language

        return aws_transcribe
    except Exception as e:
        print(f"Error fetching AWS Transcribe data: {e}")
        return {}


@lru_cache(maxsize=1)
def fetch_aws_polly_languages():
    """Scrape AWS Polly (TTS) supported languages"""
    url = "https://docs.aws.amazon.com/polly/latest/dg/supported-languages.html"

    try:
        response = requests.get(url, timeout=10)
        response.raise_for_status()
        soup = BeautifulSoup(response.content, 'html.parser')

        tables = soup.find_all('table')

        aws_polly = {}
        for table in tables:
            rows = table.find_all('tr')
            if not rows:
                continue

            headers = [th.get_text(strip=True) for th in rows[0].find_all('th')]

            # Locate the language code and language name columns
            lang_code_idx = None
            lang_name_idx = None
            for idx, header in enumerate(headers):
                if 'Language code' in header or 'language code' in header.lower():
                    lang_code_idx = idx
                if 'Language' == header or header.startswith('Language'):
                    lang_name_idx = idx

            if lang_code_idx is not None:
                for row in rows[1:]:
                    cols = row.find_all('td')
                    if len(cols) > lang_code_idx:
                        locale = cols[lang_code_idx].get_text(strip=True)
                        # Use "is not None" so a name column at index 0 is not skipped
                        language = cols[lang_name_idx].get_text(strip=True) if lang_name_idx is not None and len(cols) > lang_name_idx else ''
                        if locale and locale not in ['—', '-', '']:
                            # One row per voice: increment the count for known locales
                            if locale in aws_polly:
                                aws_polly[locale]['voice_count'] += 1
                            else:
                                aws_polly[locale] = {
                                    'language': language,
                                    'voice_count': 1
                                }

        return aws_polly
    except Exception as e:
        print(f"Error fetching AWS Polly data: {e}")
        return {}


def get_azure_locales_for_language(language_code):
    """
    Get Azure BCP-47 locales for a language using its alpha2 code
    Returns list of matching locales from Azure
    """
    lang_info = LANGUAGES.get(language_code)
    if not lang_info or not lang_info['alpha2']:
        return []

    alpha2 = lang_info['alpha2']
    azure_asr = fetch_azure_asr_languages()
    azure_tts = fetch_azure_tts_languages()

    # A locale matches if it equals the alpha2 code or starts with "<alpha2>-"
    matching_locales = set()

    for locale in azure_asr.keys():
        if locale.startswith(alpha2 + '-') or locale == alpha2:
            matching_locales.add(locale)

    for locale in azure_tts.keys():
        if locale.startswith(alpha2 + '-') or locale == alpha2:
            matching_locales.add(locale)

    return sorted(matching_locales)
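
# Illustrative example of the locale-matching rule used here and in the Google/AWS
# helpers below (locale values are hypothetical, not fetched data): for English
# (alpha2 "en"), "en-US", "en-GB", and "en-IN" would match, while "es-US" would not.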


def get_google_locales_for_language(language_code):
    """
    Get Google Cloud BCP-47 locales for a language using its alpha2 code
    Returns list of matching locales from Google Cloud
    """
    lang_info = LANGUAGES.get(language_code)
    if not lang_info or not lang_info['alpha2']:
        return []

    alpha2 = lang_info['alpha2']
    google_stt = fetch_google_stt_languages()
    google_tts = fetch_google_tts_languages()

    matching_locales = set()

    for locale in google_stt.keys():
        if locale.startswith(alpha2 + '-') or locale == alpha2:
            matching_locales.add(locale)

    for locale in google_tts.keys():
        if locale.startswith(alpha2 + '-') or locale == alpha2:
            matching_locales.add(locale)

    return sorted(matching_locales)


def check_elevenlabs_multilingual_v2_support(language_code):
    """
    Check if ElevenLabs Multilingual v2 supports a language using ISO 639-1 (alpha2) codes
    Returns True if supported, False otherwise
    """
    lang_info = LANGUAGES.get(language_code)
    if not lang_info:
        return False

    supported_codes = fetch_elevenlabs_multilingual_v2()

    if lang_info['alpha2'] and lang_info['alpha2'] in supported_codes:
        return True

    return False


def check_elevenlabs_turbo_v3_support(language_code):
    """
    Check if ElevenLabs Turbo v3 supports a language using ISO 639-3 (alpha3) codes
    Returns True if supported, False otherwise
    """
    lang_info = LANGUAGES.get(language_code)
    if not lang_info:
        return False

    supported_codes = fetch_elevenlabs_turbo_v3()

    # Try both the bibliographic and terminological alpha3 codes
    if lang_info['alpha3_b'] and lang_info['alpha3_b'] in supported_codes:
        return True

    if lang_info['alpha3_t'] and lang_info['alpha3_t'] in supported_codes:
        return True

    return False


def get_aws_locales_for_language(language_code):
    """
    Get AWS locales for a language using its alpha2 code
    Returns list of matching locales from AWS Transcribe and Polly
    """
    lang_info = LANGUAGES.get(language_code)
    if not lang_info or not lang_info['alpha2']:
        return []

    alpha2 = lang_info['alpha2']
    aws_transcribe = fetch_aws_transcribe_languages()
    aws_polly = fetch_aws_polly_languages()

    matching_locales = set()

    for locale in aws_transcribe.keys():
        if locale.startswith(alpha2 + '-') or locale == alpha2:
            matching_locales.add(locale)

    for locale in aws_polly.keys():
        if locale.startswith(alpha2 + '-') or locale == alpha2:
            matching_locales.add(locale)

    return sorted(matching_locales)


def search_huggingface_models(language_code, pipeline_tag, max_results=100, max_pages=3):
    """
    Search HuggingFace for models supporting a specific language
    pipeline_tag: 'automatic-speech-recognition' or 'text-to-speech'
    max_results: maximum number of models to return
    max_pages: maximum number of pages to search per language code
    Returns tuple: (list of model dictionaries, log messages)
    """
    lang_info = LANGUAGES.get(language_code)
    logs = []

    if not lang_info:
        logs.append(f"No language info found for code: {language_code}")
        return [], logs

    # Collect every ISO code variant for this language
    codes_to_try = []
    if lang_info['alpha2']:
        codes_to_try.append(lang_info['alpha2'])
    if lang_info['alpha3_b']:
        codes_to_try.append(lang_info['alpha3_b'])
    if lang_info['alpha3_t']:
        codes_to_try.append(lang_info['alpha3_t'])

    logs.append(f"Language codes to search: {set(codes_to_try)}")

    models = []
    seen_models = set()

    for code in codes_to_try:
        if len(models) >= max_results:
            break

        logs.append(f"Searching for language code: {code}")

        for page in range(max_pages):
            if len(models) >= max_results:
                break

            try:
                url = f"https://huggingface.co/models?pipeline_tag={pipeline_tag}&language={code}&sort=trending"
                if page > 0:
                    url += f"&p={page}"

                logs.append(f" Page {page}: {url}")

                headers = {
                    'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36'
                }

                response = requests.get(url, headers=headers, timeout=10)
                response.raise_for_status()

                soup = BeautifulSoup(response.content, 'html.parser')

                model_cards = soup.find_all('article', class_='overview-card-wrapper')

                if not model_cards:
                    logs.append(f" No model cards found on page {page}")
                    break

                logs.append(f" Found {len(model_cards)} model cards on page {page}")

                for card in model_cards:
                    if len(models) >= max_results:
                        break

                    try:
                        link = card.find('a', href=True)
                        if link:
                            href = link.get('href', '')
                            model_name = href.lstrip('/')

                            if model_name and model_name != '#' and model_name not in seen_models:
                                seen_models.add(model_name)

                                downloads = 0
                                likes = 0
                                size = ""

                                # Stats are rendered as an icon followed by a number
                                svgs = card.find_all('svg')

                                for svg in svgs:
                                    # The stat value is the text node (or tag) right after the icon
                                    next_elem = svg.find_next_sibling(string=True)
                                    stat_text = ""

                                    if next_elem and next_elem.strip():
                                        stat_text = next_elem.strip()
                                    else:
                                        next_tag = svg.find_next_sibling()
                                        if next_tag:
                                            stat_text = next_tag.get_text(strip=True)

                                    if not stat_text or len(stat_text) < 1:
                                        continue

                                    svg_str = str(svg)

                                    # Identify the stat by the icon's SVG path: download arrow
                                    if 'M26 24v4H6v-4H4v4a2 2 0 0 0 2 2h20a2 2 0 0 0 2-2v-4zm0-10l-1.41-1.41L17 20.17V2h-2v18.17l-7.59-7.58L6 14l10 10l10-10z' in svg_str:
                                        downloads = parse_stat_number(stat_text)
                                    # Heart icon: likes
                                    elif 'M22.45,6a5.47,5.47,0,0,1,3.91,1.64,5.7,5.7,0,0,1,0,8L16,26.13' in svg_str:
                                        likes = parse_stat_number(stat_text)
                                    # Remaining icon: text that looks like a parameter count (e.g. "7B") is the model size
                                    elif 'M10 10H8.4V8.4H10V10Zm0-3.2H8.4V5.2H10v1.6ZM6.8 10H5.2V8.4h1.6V10Z' in svg_str:
                                        if len(stat_text) <= 6 and re.search(r'\d+\.?\d*\s*[Bb]', stat_text):
                                            size = stat_text

                                models.append({
                                    'name': model_name,
                                    'url': f"https://huggingface.co/{model_name}",
                                    'downloads': downloads,
                                    'likes': likes,
                                    'size': size
                                })
                    except Exception as e:
                        logs.append(f" Error parsing model card: {e}")
                        continue

            except Exception as e:
                logs.append(f" ERROR searching page {page}: {e}")
                break

    # Sort by downloads, most popular first
    models.sort(key=lambda x: x['downloads'], reverse=True)

    logs.append(f"Total unique models found: {len(models)}")
    return models, logs
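
# Illustrative call (the model name and numbers below are hypothetical; real results
# depend on what HuggingFace serves at request time):
#   models, logs = search_huggingface_models("swa", "automatic-speech-recognition",
#                                            max_results=10, max_pages=1)
#   models[0] -> {'name': 'some-org/wav2vec2-swahili', 'url': 'https://huggingface.co/some-org/wav2vec2-swahili',
#                 'downloads': 1200, 'likes': 4, 'size': ''}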


def get_huggingface_stats(item_name, item_type='datasets'):
    """
    Get likes and downloads for a HuggingFace dataset or model using the API
    item_type: 'datasets' or 'models'
    Returns dict with likes and downloads

    NOTE: This method is currently NOT USED. We parse stats directly from HTML instead.
    Keeping it here as a fallback in case HTML parsing fails.
    """
    try:
        api_url = f"https://huggingface.co/api/{item_type}/{item_name}"
        response = requests.get(api_url, timeout=5)

        if response.status_code == 200:
            data = response.json()
            return {
                'likes': data.get('likes', 0),
                'downloads': data.get('downloads', 0)
            }
    except Exception:
        pass

    return {'likes': 0, 'downloads': 0}


def parse_stat_number(stat_text):
    """
    Parse HuggingFace stat numbers like '4.07M', '23.4k', '349' into integers
    Returns integer value or 0 if parsing fails
    """
    if not stat_text:
        return 0

    stat_text = stat_text.strip().upper()

    try:
        # Millions, e.g. '4.07M'
        if 'M' in stat_text:
            return int(float(stat_text.replace('M', '')) * 1_000_000)
        # Thousands, e.g. '23.4K'
        elif 'K' in stat_text:
            return int(float(stat_text.replace('K', '')) * 1_000)
        # Plain numbers, possibly with thousands separators
        else:
            return int(stat_text.replace(',', ''))
    except (ValueError, AttributeError):
        return 0
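
# Illustrative conversions for the parser above (example strings, not scraped data):
#   parse_stat_number("4.07M") -> 4070000
#   parse_stat_number("23.4k") -> 23400
#   parse_stat_number("1,349") -> 1349
#   parse_stat_number("n/a")   -> 0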


def deduplicate_models(models):
    """
    Deduplicate models by base name (without user/org prefix)
    Keep the model with most downloads and count duplicates
    Returns list of deduplicated models with duplicate count added
    """
    from collections import defaultdict

    # Group models that share the same base name (the part after "user/")
    grouped = defaultdict(list)
    for model in models:
        name_parts = model['name'].split('/')
        if len(name_parts) > 1:
            base_name = name_parts[-1]
        else:
            base_name = model['name']

        grouped[base_name].append(model)

    # Within each group keep the most-downloaded model
    deduplicated = []
    for base_name, model_list in grouped.items():
        model_list.sort(key=lambda x: x['downloads'], reverse=True)
        best_model = model_list[0]

        # Record how many other copies were folded into this entry
        best_model['duplicates'] = len(model_list) - 1

        deduplicated.append(best_model)

    # Sort the final list by downloads
    deduplicated.sort(key=lambda x: x['downloads'], reverse=True)

    return deduplicated
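
# Illustrative behaviour of deduplicate_models (hypothetical entries, not real models):
# given "orgA/whisper-small" with 120 downloads and "orgB/whisper-small" with 80 downloads,
# both share the base name "whisper-small", so only the orgA entry is kept and its
# 'duplicates' field is set to 1.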


def search_huggingface_datasets(language_code, task_category, max_results=100, max_pages=3):
    """
    Search HuggingFace for datasets supporting a specific language
    task_category: 'automatic-speech-recognition' or 'text-to-speech'
    max_results: maximum number of datasets to return
    max_pages: maximum number of pages to search per language code
    Returns tuple: (list of dataset dictionaries, log messages)
    """
    lang_info = LANGUAGES.get(language_code)
    logs = []

    if not lang_info:
        logs.append(f"No language info found for code: {language_code}")
        return [], logs

    # Collect every ISO code variant for this language
    language_codes = set()
    if lang_info['alpha2']:
        language_codes.add(lang_info['alpha2'])
    if lang_info['alpha3_b']:
        language_codes.add(lang_info['alpha3_b'])
    if lang_info['alpha3_t']:
        language_codes.add(lang_info['alpha3_t'])

    logs.append(f"Language codes to search: {language_codes}")

    datasets = []
    seen_datasets = set()

    for code in language_codes:
        if len(datasets) >= max_results:
            break

        logs.append(f"Searching for language code: {code}")

        for page in range(max_pages):
            if len(datasets) >= max_results:
                break

            try:
                url = f"https://huggingface.co/datasets?task_categories=task_categories:{task_category}&language=language:{code}&sort=trending"
                if page > 0:
                    url += f"&p={page}"

                logs.append(f" Page {page}: {url}")

                headers = {
                    'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36'
                }

                response = requests.get(url, headers=headers, timeout=10)
                response.raise_for_status()

                soup = BeautifulSoup(response.content, 'html.parser')

                dataset_cards = soup.find_all('article', class_='overview-card-wrapper')

                if not dataset_cards:
                    logs.append(f" No dataset cards found on page {page}")
                    break

                logs.append(f" Found {len(dataset_cards)} dataset cards on page {page}")

                for card in dataset_cards:
                    if len(datasets) >= max_results:
                        break

                    try:
                        link = card.find('a', href=True)
                        if link:
                            href = link.get('href', '')
                            dataset_path = href.lstrip('/')

                            # Dataset links are prefixed with "datasets/"
                            if dataset_path.startswith('datasets/'):
                                dataset_name = dataset_path[9:]
                            else:
                                dataset_name = dataset_path

                            if dataset_name and dataset_name != '#' and dataset_name not in seen_datasets:
                                seen_datasets.add(dataset_name)

                                downloads = 0
                                likes = 0
                                size = ""

                                # Stats are rendered as an icon followed by a number
                                svgs = card.find_all('svg')

                                for svg in svgs:
                                    # The stat value is the text node (or tag) right after the icon
                                    next_elem = svg.find_next_sibling(string=True)
                                    stat_text = ""

                                    if next_elem and next_elem.strip():
                                        stat_text = next_elem.strip()
                                    else:
                                        next_tag = svg.find_next_sibling()
                                        if next_tag:
                                            stat_text = next_tag.get_text(strip=True)

                                    if not stat_text or len(stat_text) < 1 or stat_text in ['Viewer', 'Updated']:
                                        continue

                                    svg_str = str(svg)

                                    # Identify the stat by the icon's SVG path: download arrow
                                    if 'M26 24v4H6v-4H4v4a2 2 0 0 0 2 2h20a2 2 0 0 0 2-2v-4zm0-10l-1.41-1.41L17 20.17V2h-2v18.17l-7.59-7.58L6 14l10 10l10-10z' in svg_str:
                                        downloads = parse_stat_number(stat_text)
                                    # Heart icon: likes
                                    elif 'M22.45,6a5.47,5.47,0,0,1,3.91,1.64,5.7,5.7,0,0,1,0,8L16,26.13' in svg_str:
                                        likes = parse_stat_number(stat_text)
                                    # Remaining icon with evenodd fill rules: numeric-looking text is the dataset size
                                    elif 'fill-rule="evenodd"' in svg_str and 'clip-rule="evenodd"' in svg_str:
                                        if any(c in stat_text for c in ['k', 'K', 'm', 'M']) or stat_text.replace(',', '').replace('.', '').isdigit():
                                            size = stat_text

                                datasets.append({
                                    'name': dataset_name,
                                    'url': f"https://huggingface.co/datasets/{dataset_name}",
                                    'downloads': downloads,
                                    'likes': likes,
                                    'size': size
                                })
                    except Exception as e:
                        logs.append(f" Error parsing dataset card: {e}")
                        continue

            except Exception as e:
                logs.append(f" ERROR searching page {page}: {e}")
                break

    # Sort by downloads, most popular first
    datasets.sort(key=lambda x: x['downloads'], reverse=True)

    logs.append(f"Total unique datasets found: {len(datasets)}")
    return datasets, logs


def search_language_resources(language_code, deduplicate=False):
    """
    Search for ASR/TTS resources for a given language
    Returns results organized by service type
    deduplicate: if True, remove duplicate models (same base name) and keep only the one with most downloads
    """
    all_logs = []

    if not language_code:
        return None, None, None, 0, 0, None, None, 0, 0, ""

    lang_info = LANGUAGES.get(language_code)
    if not lang_info:
        return None, None, None, 0, 0, None, None, 0, 0, ""

    language_name = lang_info['name']
    all_logs.append(f"=== Searching for {language_name} ({language_code}) ===")
    all_logs.append(f"Language codes: alpha2={lang_info['alpha2']}, alpha3_b={lang_info['alpha3_b']}, alpha3_t={lang_info['alpha3_t']}")

    # Azure Speech Services
    all_logs.append("\n[Azure Speech Services]")
    azure_asr = fetch_azure_asr_languages()
    azure_tts = fetch_azure_tts_languages()
    all_logs.append(f" Fetched {len(azure_asr)} ASR languages and {len(azure_tts)} TTS languages from Azure")

    azure_locales = get_azure_locales_for_language(language_code)
    all_logs.append(f" Matching Azure locales: {azure_locales}")

    azure_asr_locales = [loc for loc in azure_locales if loc in azure_asr]
    azure_asr_available = len(azure_asr_locales) > 0
    all_logs.append(f" Azure ASR: {'✅ Supported' if azure_asr_available else '❌ Not supported'} ({len(azure_asr_locales)} locales)")

    azure_tts_locales = [loc for loc in azure_locales if loc in azure_tts]
    azure_tts_available = len(azure_tts_locales) > 0
    azure_total_voices = sum(azure_tts[loc]['voice_count'] for loc in azure_tts_locales)
    all_logs.append(f" Azure TTS: {'✅ Supported' if azure_tts_available else '❌ Not supported'} ({len(azure_tts_locales)} locales, {azure_total_voices} voices)")

    # Google Cloud Speech
    all_logs.append("\n[Google Cloud Speech]")
    google_stt = fetch_google_stt_languages()
    google_tts = fetch_google_tts_languages()
    all_logs.append(f" Fetched {len(google_stt)} STT languages and {len(google_tts)} TTS languages from Google Cloud")

    google_locales = get_google_locales_for_language(language_code)
    all_logs.append(f" Matching Google Cloud locales: {google_locales}")

    google_stt_locales = [loc for loc in google_locales if loc in google_stt]
    google_stt_available = len(google_stt_locales) > 0
    all_logs.append(f" Google STT: {'✅ Supported' if google_stt_available else '❌ Not supported'} ({len(google_stt_locales)} locales)")

    google_tts_locales = [loc for loc in google_locales if loc in google_tts]
    google_tts_available = len(google_tts_locales) > 0
    google_total_voices = sum(google_tts[loc]['voice_count'] for loc in google_tts_locales)
    all_logs.append(f" Google TTS: {'✅ Supported' if google_tts_available else '❌ Not supported'} ({len(google_tts_locales)} locales, {google_total_voices} voices)")

    # AWS (Transcribe + Polly)
    all_logs.append("\n[AWS (Transcribe + Polly)]")
    aws_transcribe = fetch_aws_transcribe_languages()
    aws_polly = fetch_aws_polly_languages()
    all_logs.append(f" Fetched {len(aws_transcribe)} Transcribe languages and {len(aws_polly)} Polly languages from AWS")

    aws_locales = get_aws_locales_for_language(language_code)
    all_logs.append(f" Matching AWS locales: {aws_locales}")

    aws_transcribe_locales = [loc for loc in aws_locales if loc in aws_transcribe]
    aws_transcribe_available = len(aws_transcribe_locales) > 0
    all_logs.append(f" AWS Transcribe: {'✅ Supported' if aws_transcribe_available else '❌ Not supported'} ({len(aws_transcribe_locales)} locales)")

    aws_polly_locales = [loc for loc in aws_locales if loc in aws_polly]
    aws_polly_available = len(aws_polly_locales) > 0
    aws_total_voices = sum(aws_polly[loc]['voice_count'] for loc in aws_polly_locales)
    all_logs.append(f" AWS Polly: {'✅ Supported' if aws_polly_available else '❌ Not supported'} ({len(aws_polly_locales)} locales, {aws_total_voices} voices)")

    # Build the commercial services summary table
    commercial_rows = []

    if azure_asr_available:
        azure_asr_text = f"✅ {len(azure_asr_locales)} locale(s)"
    else:
        azure_asr_text = "❌ N/A"

    if azure_tts_available:
        azure_tts_text = f"✅ {len(azure_tts_locales)} locale(s), {azure_total_voices} voice(s)"
    else:
        azure_tts_text = "❌ N/A"

    commercial_rows.append({
        "Service": "Azure Speech",
        "ASR": azure_asr_text,
        "TTS": azure_tts_text,
    })

    if google_stt_available:
        google_stt_text = f"✅ {len(google_stt_locales)} locale(s)"
    else:
        google_stt_text = "❌ N/A"

    if google_tts_available:
        google_tts_text = f"✅ {len(google_tts_locales)} locale(s), {google_total_voices} voice(s)"
    else:
        google_tts_text = "❌ N/A"

    commercial_rows.append({
        "Service": "Google Cloud Speech",
        "ASR": google_stt_text,
        "TTS": google_tts_text,
    })

    if aws_transcribe_available:
        aws_transcribe_text = f"✅ {len(aws_transcribe_locales)} locale(s)"
    else:
        aws_transcribe_text = "❌ N/A"

    if aws_polly_available:
        aws_polly_text = f"✅ {len(aws_polly_locales)} locale(s), {aws_total_voices} voice(s)"
    else:
        aws_polly_text = "❌ N/A"

    commercial_rows.append({
        "Service": "AWS (Transcribe + Polly)",
        "ASR": aws_transcribe_text,
        "TTS": aws_polly_text,
    })

    # ElevenLabs Multilingual v2
    all_logs.append("\n[ElevenLabs]")
    elevenlabs_v2_supported = check_elevenlabs_multilingual_v2_support(language_code)
    all_logs.append(f" Multilingual v2: {'✅ Supported' if elevenlabs_v2_supported else '❌ Not supported'}")

    if elevenlabs_v2_supported:
        elevenlabs_v2_tts_text = "✅ Supported"
    else:
        elevenlabs_v2_tts_text = "❌ N/A"

    commercial_rows.append({
        "Service": "ElevenLabs Multilingual v2",
        "ASR": "N/A",
        "TTS": elevenlabs_v2_tts_text,
    })

    # ElevenLabs Turbo v3
    elevenlabs_v3_supported = check_elevenlabs_turbo_v3_support(language_code)
    all_logs.append(f" Turbo v3: {'✅ Supported' if elevenlabs_v3_supported else '❌ Not supported'}")

    if elevenlabs_v3_supported:
        elevenlabs_v3_tts_text = "✅ Supported"
    else:
        elevenlabs_v3_tts_text = "❌ N/A"

    commercial_rows.append({
        "Service": "ElevenLabs Turbo v3",
        "ASR": "N/A",
        "TTS": elevenlabs_v3_tts_text,
    })

    commercial_df = pd.DataFrame(commercial_rows)

    # HuggingFace models
    all_logs.append("\n[HuggingFace Models]")

    asr_models, asr_model_logs = search_huggingface_models(language_code, 'automatic-speech-recognition', max_results=100, max_pages=5)
    all_logs.extend([f" [ASR] {log}" for log in asr_model_logs])

    tts_models, tts_model_logs = search_huggingface_models(language_code, 'text-to-speech', max_results=100, max_pages=5)
    all_logs.extend([f" [TTS] {log}" for log in tts_model_logs])

    # Optionally collapse models that share a base name
    if deduplicate:
        all_logs.append(f"\n[Deduplication]")
        asr_before = len(asr_models)
        asr_models = deduplicate_models(asr_models)
        all_logs.append(f" ASR models: {asr_before} → {len(asr_models)} (removed {asr_before - len(asr_models)} duplicates)")

        tts_before = len(tts_models)
        tts_models = deduplicate_models(tts_models)
        all_logs.append(f" TTS models: {tts_before} → {len(tts_models)} (removed {tts_before - len(tts_models)} duplicates)")
    else:
        # Without deduplication every model counts as a single entry
        for model in asr_models:
            model['duplicates'] = 1
        for model in tts_models:
            model['duplicates'] = 1

    # ASR models table
    asr_models_data = []
    for model in asr_models:
        asr_models_data.append({
            "Model Name": f"[{model['name']}]({model['url']})",
            "Downloads": model['downloads'],
            "Likes": model['likes'],
            "Size": model.get('size', ''),
            "Duplicates": model.get('duplicates', 1)
        })

    if asr_models_data:
        asr_models_df = pd.DataFrame(asr_models_data)
    else:
        asr_models_df = pd.DataFrame(columns=["Model Name", "Downloads", "Likes", "Size", "Duplicates"])

    # TTS models table
    tts_models_data = []
    for model in tts_models:
        tts_models_data.append({
            "Model Name": f"[{model['name']}]({model['url']})",
            "Downloads": model['downloads'],
            "Likes": model['likes'],
            "Size": model.get('size', ''),
            "Duplicates": model.get('duplicates', 1)
        })

    if tts_models_data:
        tts_models_df = pd.DataFrame(tts_models_data)
    else:
        tts_models_df = pd.DataFrame(columns=["Model Name", "Downloads", "Likes", "Size", "Duplicates"])

    # HuggingFace datasets
    all_logs.append("\n[HuggingFace Datasets]")
    asr_datasets, asr_dataset_logs = search_huggingface_datasets(language_code, 'automatic-speech-recognition', max_results=100, max_pages=5)
    all_logs.extend([f" [ASR] {log}" for log in asr_dataset_logs])

    tts_datasets, tts_dataset_logs = search_huggingface_datasets(language_code, 'text-to-speech', max_results=100, max_pages=5)
    all_logs.extend([f" [TTS] {log}" for log in tts_dataset_logs])

    # ASR datasets table
    asr_datasets_data = []
    for dataset in asr_datasets:
        asr_datasets_data.append({
            "Dataset Name": f"[{dataset['name']}]({dataset['url']})",
            "Downloads": dataset['downloads'],
            "Likes": dataset['likes'],
            "Size": dataset.get('size', '')
        })

    if asr_datasets_data:
        asr_datasets_df = pd.DataFrame(asr_datasets_data)
    else:
        asr_datasets_df = pd.DataFrame(columns=["Dataset Name", "Downloads", "Likes", "Size"])

    # TTS datasets table
    tts_datasets_data = []
    for dataset in tts_datasets:
        tts_datasets_data.append({
            "Dataset Name": f"[{dataset['name']}]({dataset['url']})",
            "Downloads": dataset['downloads'],
            "Likes": dataset['likes'],
            "Size": dataset.get('size', '')
        })

    if tts_datasets_data:
        tts_datasets_df = pd.DataFrame(tts_datasets_data)
    else:
        tts_datasets_df = pd.DataFrame(columns=["Dataset Name", "Downloads", "Likes", "Size"])

    log_text = "\n".join(all_logs)

    return commercial_df, asr_models_df, tts_models_df, len(asr_models), len(tts_models), asr_datasets_df, tts_datasets_df, len(asr_datasets), len(tts_datasets), log_text


# Load data at import time so the dropdown is ready when the UI is built
print("Initializing Speech Resource Finder...")
load_app_content()
load_language_list()
load_language_taxonomy()

# Dropdown entries look like "eng: English", sorted by language name
language_choices = [f"{code}: {info['name']}" for code, info in sorted(LANGUAGES.items(), key=lambda x: x[1]['name'])]
print(f"Created dropdown with {len(language_choices)} language options")


with gr.Blocks(title=APP_CONTENT["title"]) as demo:
    gr.Markdown(f"# {APP_CONTENT['title']}")
    gr.Markdown(APP_CONTENT["description"])

    with gr.Row(equal_height=True):
        with gr.Column(scale=2):
            language_dropdown = gr.Dropdown(
                choices=language_choices,
                label="Select Language",
                info="Type to search for a language",
                allow_custom_value=False,
                filterable=True,
            )
        with gr.Column(scale=1):
            language_metadata = gr.HTML(
                """<div style='padding: 15px; border: 2px solid #e0e0e0; border-radius: 4px; background-color: #fafafa; height: 100%; display: flex; align-items: center; justify-content: center; box-sizing: border-box;'>
                <p style='margin: 0; color: #333; font-size: 14px;'>Select a language to see resource classification</p>
                </div>""",
                elem_id="language-metadata"
            )

    gr.Markdown("## Commercial Services")
    commercial_table = gr.Dataframe(
        headers=["Service", "ASR", "TTS"],
        interactive=False,
        wrap=True,
    )

    gr.Markdown("## HuggingFace Models")

    with gr.Row():
        deduplicate_checkbox = gr.Checkbox(
            label="Deduplicate models",
            value=True,
            info="Keep only the model with most downloads for each base name"
        )

    with gr.Tabs():
        with gr.Tab(label="ASR Models") as asr_tab:
            asr_count_label = gr.Markdown("*Loading...*")
            asr_models_table = gr.Dataframe(
                headers=["Model Name", "Downloads", "Likes", "Size", "Duplicates"],
                interactive=False,
                wrap=True,
                datatype=["markdown", "number", "number", "str", "number"],
            )

        with gr.Tab(label="TTS Models") as tts_tab:
            tts_count_label = gr.Markdown("*Loading...*")
            tts_models_table = gr.Dataframe(
                headers=["Model Name", "Downloads", "Likes", "Size", "Duplicates"],
                interactive=False,
                wrap=True,
                datatype=["markdown", "number", "number", "str", "number"],
            )

    gr.Markdown("## HuggingFace Datasets")

    with gr.Tabs():
        with gr.Tab(label="ASR Datasets") as asr_datasets_tab:
            asr_datasets_count_label = gr.Markdown("*Loading...*")
            asr_datasets_table = gr.Dataframe(
                headers=["Dataset Name", "Downloads", "Likes", "Size"],
                interactive=False,
                wrap=True,
                datatype=["markdown", "number", "number", "str"],
            )

        with gr.Tab(label="TTS Datasets") as tts_datasets_tab:
            tts_datasets_count_label = gr.Markdown("*Loading...*")
            tts_datasets_table = gr.Dataframe(
                headers=["Dataset Name", "Downloads", "Likes", "Size"],
                interactive=False,
                wrap=True,
                datatype=["markdown", "number", "number", "str"],
            )

    with gr.Accordion("Logs", open=False):
        log_textbox = gr.Textbox(
            show_label=False,
            lines=15,
            max_lines=30,
            interactive=False,
            placeholder="Logs will appear here...",
            autoscroll=True,
        )

    with gr.Accordion("About this tool", open=False):
        gr.Markdown(APP_CONTENT["full_content"])

    def on_search(language_selection, deduplicate):
        if not language_selection:
            default_html = """<div style='padding: 15px; border: 2px solid #e0e0e0; border-radius: 4px; background-color: #fafafa; height: 100%; display: flex; align-items: center; justify-content: center; box-sizing: border-box;'>
            <p style='margin: 0; color: #333; font-size: 14px;'>Select a language to see resource classification</p>
            </div>"""
            return default_html, None, "", None, "", None, "", None, "", None, ""

        language_code = language_selection.split(":")[0].strip()

        language_name = LANGUAGES.get(language_code, {}).get("name", "")

        level, classification = get_language_taxonomy_info(language_name)

        if level is not None:
            color = get_taxonomy_color(level)
            metadata_html = f"""<div style='padding: 15px; border: 2px solid {color}; border-radius: 4px; background-color: #ffffff; height: 100%; box-sizing: border-box; display: flex; flex-direction: column; justify-content: center;'>
            <h4 style='margin: 0 0 8px 0; color: #333; font-size: 16px;'>{language_name}</h4>
            <div style='margin: 8px 0;'>
            <span style='padding: 6px 12px; background-color: {color}; color: white; border-radius: 4px; font-weight: bold; font-size: 13px;'>{classification}</span>
            </div>
            <p style='margin: 8px 0 0 0; font-size: 11px; color: #555;'>Source: <a href='https://microsoft.github.io/linguisticdiversity/' target='_blank' style='color: #1976d2; text-decoration: none;'>Joshi et al.</a></p>
            </div>"""
        else:
            metadata_html = f"""<div style='padding: 15px; border: 2px solid #757575; border-radius: 4px; background-color: #ffffff; height: 100%; box-sizing: border-box; display: flex; flex-direction: column; justify-content: center;'>
            <h4 style='margin: 0 0 8px 0; color: #333; font-size: 16px;'>{language_name}</h4>
            <div style='margin: 8px 0;'>
            <span style='padding: 6px 12px; background-color: #757575; color: white; border-radius: 4px; font-weight: bold; font-size: 13px;'>Unknown</span>
            </div>
            <p style='margin: 8px 0 0 0; font-size: 11px; color: #555;'>Source: <a href='https://microsoft.github.io/linguisticdiversity/' target='_blank' style='color: #1976d2; text-decoration: none;'>Joshi et al.</a></p>
            </div>"""

        commercial_df, asr_models_df, tts_models_df, asr_models_count, tts_models_count, asr_datasets_df, tts_datasets_df, asr_datasets_count, tts_datasets_count, logs = search_language_resources(language_code, deduplicate=deduplicate)

        asr_models_label = f"**Found {asr_models_count} ASR model(s)**"
        tts_models_label = f"**Found {tts_models_count} TTS model(s)**"
        asr_datasets_label = f"**Found {asr_datasets_count} ASR dataset(s)**"
        tts_datasets_label = f"**Found {tts_datasets_count} TTS dataset(s)**"

        return metadata_html, commercial_df, asr_models_label, asr_models_df, tts_models_label, tts_models_df, asr_datasets_label, asr_datasets_df, tts_datasets_label, tts_datasets_df, logs

    language_dropdown.change(
        fn=on_search,
        inputs=[language_dropdown, deduplicate_checkbox],
        outputs=[language_metadata, commercial_table, asr_count_label, asr_models_table, tts_count_label, tts_models_table, asr_datasets_count_label, asr_datasets_table, tts_datasets_count_label, tts_datasets_table, log_textbox],
    )

    deduplicate_checkbox.change(
        fn=on_search,
        inputs=[language_dropdown, deduplicate_checkbox],
        outputs=[language_metadata, commercial_table, asr_count_label, asr_models_table, tts_count_label, tts_models_table, asr_datasets_count_label, asr_datasets_table, tts_datasets_count_label, tts_datasets_table, log_textbox],
    )


if __name__ == "__main__":
    demo.launch(server_name="0.0.0.0", server_port=7860, share=False, show_error=True)