Alp committed · Commit a85d2eb · 1 Parent(s): f52126a

CV info, full 7925 langs

Browse files:
- app.py +168 -41
- app_content.md +15 -2
- cv-corpus-20.0-2024-12-06.json +0 -0
- language-codes-full.csv +0 -0
app.py
CHANGED
@@ -6,20 +6,27 @@ from functools import lru_cache
 import csv
 from io import StringIO
 import re
+import json
 
 # Configuration
 LANGUAGE_CODES_FILE = "language-codes-full.csv"
 APP_CONTENT_FILE = "app_content.md"
 LANGUAGE_TAXONOMY_URL = "https://microsoft.github.io/linguisticdiversity/assets/lang2tax.txt"
+COMMON_VOICE_DATA_FILE = "cv-corpus-20.0-2024-12-06.json"
+COMMON_VOICE_VERSION = "20.0 (2024-12-06)"
 
 # Language list will be loaded from CSV
-# Structure: {
+# Structure: {iso_639_2: {"name": str, "iso_639_1": str, "french_name": str}}
 LANGUAGES = {}
 
 # Language taxonomy mapping (from Joshi et al.'s linguistic diversity paper)
 # Structure: {language_name_lowercase: level}
 LANGUAGE_TAXONOMY = {}
 
+# Common Voice dataset
+# Structure: {locale_code: {validHrs: float, totalHrs: float, splits: {gender: {...}}, ...}}
+COMMON_VOICE_DATA = {}
+
 # Taxonomy level descriptions
 TAXONOMY_LEVELS = {
     0: "The Left-Behinds",
@@ -89,32 +96,27 @@ def load_language_list(csv_path=None):
         with open(csv_path, 'r', encoding='utf-8') as f:
             reader = csv.DictReader(f)
             for row in reader:
-                #
-                if primary_code and name:
-                    LANGUAGES[primary_code] = {
+                # New CSV structure: ISO 639-2, ISO 639-1, English name, French name
+                iso_639_2 = row['ISO 639-2'].strip()
+                iso_639_1 = row['ISO 639-1'].strip()
+                name = row['English name'].strip()
+                french_name = row['French name'].strip()
+
+                if iso_639_2 and name:
+                    LANGUAGES[iso_639_2] = {
                         "name": name,
-                        "
-                        "
+                        "iso_639_1": iso_639_1,
+                        "french_name": french_name,
+                        # Keep legacy field names for backward compatibility
+                        "alpha2": iso_639_1,
+                        "alpha3_b": iso_639_2,
+                        "alpha3_t": ""  # Not used in new format
                     }
         print(f"Loaded {len(LANGUAGES)} languages from {csv_path}")
     except Exception as e:
-        print(f"
-        LANGUAGES = {
-            "eng": {"name": "English", "alpha3_b": "eng", "alpha3_t": "", "alpha2": "en"},
-            "spa": {"name": "Spanish", "alpha3_b": "spa", "alpha3_t": "", "alpha2": "es"},
-            "fra": {"name": "French", "alpha3_b": "fra", "alpha3_t": "", "alpha2": "fr"},
-            "deu": {"name": "German", "alpha3_b": "ger", "alpha3_t": "deu", "alpha2": "de"},
-        }
-        print(f"Using fallback with {len(LANGUAGES)} languages")
+        print(f"ERROR: Failed to load language list from {csv_path}: {e}")
+        print("The application cannot run without the language codes CSV file.")
+        LANGUAGES = {}
 
 def load_language_taxonomy():
     """Load language taxonomy data from Microsoft's linguistic diversity project"""
@@ -138,6 +140,67 @@ def load_language_taxonomy():
         print(f"Warning: Could not load language taxonomy: {e}")
         print("Language classification will show as 'Unknown'")
 
+def load_common_voice_data():
+    """Load Common Voice dataset statistics"""
+    global COMMON_VOICE_DATA
+
+    try:
+        with open(COMMON_VOICE_DATA_FILE, 'r', encoding='utf-8') as f:
+            data = json.load(f)
+        COMMON_VOICE_DATA = data.get('locales', {})
+
+        print(f"Loaded Common Voice data for {len(COMMON_VOICE_DATA)} languages")
+    except Exception as e:
+        print(f"Warning: Could not load Common Voice data: {e}")
+        print("Common Voice statistics will not be available")
+
+def get_common_voice_stats(language_code):
+    """
+    Get Common Voice statistics for a language
+    Returns dict with validHrs, totalHrs, gender balance, and locale code or None if not available
+    """
+    lang_info = LANGUAGES.get(language_code)
+    if not lang_info:
+        return None
+
+    # Try to find CV data using different code formats
+    # 1. Try ISO 639-2 (3-letter) code directly (e.g., "zgh", "kab")
+    if language_code in COMMON_VOICE_DATA:
+        cv_locale = language_code
+        cv_data = COMMON_VOICE_DATA[language_code]
+    # 2. Try ISO 639-1 (2-letter) code (e.g., "en", "fr")
+    elif lang_info.get('iso_639_1') and lang_info['iso_639_1'] in COMMON_VOICE_DATA:
+        cv_locale = lang_info['iso_639_1']
+        cv_data = COMMON_VOICE_DATA[lang_info['iso_639_1']]
+    # 3. Try to find any locale that starts with the 2-letter code (e.g., "fy-NL", "ga-IE")
+    elif lang_info.get('iso_639_1'):
+        iso_639_1 = lang_info['iso_639_1']
+        matching_locales = [loc for loc in COMMON_VOICE_DATA.keys() if loc.startswith(iso_639_1 + '-')]
+        if matching_locales:
+            cv_locale = matching_locales[0]  # Take the first match
+            cv_data = COMMON_VOICE_DATA[cv_locale]
+        else:
+            return None
+    else:
+        return None
+
+    # Extract statistics
+    valid_hrs = cv_data.get('validHrs', 0)
+    total_hrs = cv_data.get('totalHrs', 0)
+
+    # Extract gender balance
+    gender_splits = cv_data.get('splits', {}).get('gender', {})
+    male_pct = gender_splits.get('male_masculine', 0) * 100
+    female_pct = gender_splits.get('female_feminine', 0) * 100
+
+    return {
+        'locale': cv_locale,
+        'valid_hrs': valid_hrs,
+        'total_hrs': total_hrs,
+        'male_pct': male_pct,
+        'female_pct': female_pct
+    }
+
 def get_taxonomy_color(level):
     """
     Get color code for taxonomy level (red for left-behind, green for winners)
@@ -987,15 +1050,25 @@ def search_language_resources(language_code, deduplicate=False):
     all_logs = []
 
     if not language_code:
-        return None, None, None, 0, 0, None, None, 0, 0, ""
+        return None, None, None, None, 0, 0, None, None, 0, 0, ""
 
     lang_info = LANGUAGES.get(language_code)
     if not lang_info:
-        return None, None, None, 0, 0, None, None, 0, 0, ""
+        return None, None, None, None, 0, 0, None, None, 0, 0, ""
 
     language_name = lang_info['name']
     all_logs.append(f"=== Searching for {language_name} ({language_code}) ===")
-    all_logs.append(f"Language codes:
+    all_logs.append(f"Language codes: ISO 639-1={lang_info['iso_639_1']}, ISO 639-2={language_code}")
+
+    # Check Common Voice data
+    all_logs.append("\n[Common Voice Dataset]")
+    cv_stats = get_common_voice_stats(language_code)
+    if cv_stats:
+        all_logs.append(f"  ✅ Available in Common Voice (locale: {cv_stats['locale']})")
+        all_logs.append(f"  Valid hours: {cv_stats['valid_hrs']:.1f}h, Total hours: {cv_stats['total_hrs']:.1f}h")
+        all_logs.append(f"  Gender balance: {cv_stats['male_pct']:.1f}% male, {cv_stats['female_pct']:.1f}% female")
+    else:
+        all_logs.append(f"  ❌ Not available in Common Voice")
 
     # Fetch Azure data
     all_logs.append("\n[Azure Speech Services]")
@@ -1250,14 +1323,15 @@ def search_language_resources(language_code, deduplicate=False):
     # Combine all logs
     log_text = "\n".join(all_logs)
 
-    # Return
-    return commercial_df, asr_models_df, tts_models_df, len(asr_models), len(tts_models), asr_datasets_df, tts_datasets_df, len(asr_datasets), len(tts_datasets), log_text
+    # Return CV stats, commercial services, models, datasets, and logs
+    return cv_stats, commercial_df, asr_models_df, tts_models_df, len(asr_models), len(tts_models), asr_datasets_df, tts_datasets_df, len(asr_datasets), len(tts_datasets), log_text
 
 # Initialize - load language list and app content
 print("Initializing Speech Resource Finder...")
 load_app_content()
 load_language_list()
 load_language_taxonomy()
+load_common_voice_data()
 
 # Create language choices for dropdown (code: name format for easy searching)
 language_choices = [f"{code}: {info['name']}" for code, info in sorted(LANGUAGES.items(), key=lambda x: x[1]['name'])]
@@ -1268,7 +1342,7 @@ with gr.Blocks(title=APP_CONTENT["title"]) as demo:
     gr.Markdown(APP_CONTENT["description"])
 
     with gr.Row(equal_height=True):
-        with gr.Column(scale=
+        with gr.Column(scale=70):
             language_dropdown = gr.Dropdown(
                 choices=language_choices,
                 label="Select Language",
@@ -1276,7 +1350,7 @@ with gr.Blocks(title=APP_CONTENT["title"]) as demo:
                 allow_custom_value=False,
                 filterable=True,
             )
-        with gr.Column(scale=
+        with gr.Column(scale=30):
             language_metadata = gr.HTML(
                 """<div style='padding: 15px; border: 2px solid #e0e0e0; border-radius: 4px; background-color: #fafafa; height: 100%; display: flex; align-items: center; justify-content: center; box-sizing: border-box;'>
                 <p style='margin: 0; color: #333; font-size: 14px;'>Select a language to see resource classification</p>
@@ -1284,12 +1358,23 @@ with gr.Blocks(title=APP_CONTENT["title"]) as demo:
                 elem_id="language-metadata"
             )
 
-    gr.
+    with gr.Row():
+        with gr.Column(scale=70):
+            gr.Markdown("## Commercial Services")
+            commercial_table = gr.Dataframe(
+                headers=["Service", "ASR", "TTS"],
+                interactive=False,
+                wrap=True,
+            )
+
+        with gr.Column(scale=30):
+            gr.Markdown("## Common Voice")
+            cv_info = gr.HTML(
+                """<div style='padding: 15px; border: 2px solid #e0e0e0; border-radius: 4px; background-color: #fafafa;'>
+                <p style='margin: 0; color: #666; font-size: 13px;'>Select a language</p>
+                </div>""",
+                elem_id="cv-info"
+            )
 
     gr.Markdown("## HuggingFace Models")
 
@@ -1361,7 +1446,10 @@ with gr.Blocks(title=APP_CONTENT["title"]) as demo:
             default_html = """<div style='padding: 15px; border: 2px solid #e0e0e0; border-radius: 4px; background-color: #fafafa; height: 100%; display: flex; align-items: center; justify-content: center; box-sizing: border-box;'>
            <p style='margin: 0; color: #333; font-size: 14px;'>Select a language to see resource classification</p>
            </div>"""
+            cv_default_html = """<div style='padding: 15px; border: 2px solid #e0e0e0; border-radius: 4px; background-color: #fafafa;'>
+            <p style='margin: 0; color: #666; font-size: 13px;'>Select a language</p>
+            </div>"""
+            return default_html, cv_default_html, None, "", None, "", None, "", None, "", None, ""
         # Extract the language code from "code: name" format
         language_code = language_selection.split(":")[0].strip()
 
@@ -1390,7 +1478,46 @@ with gr.Blocks(title=APP_CONTENT["title"]) as demo:
        <p style='margin: 8px 0 0 0; font-size: 11px; color: #555;'>Source: <a href='https://microsoft.github.io/linguisticdiversity/' target='_blank' style='color: #1976d2; text-decoration: none;'>Joshi et al.</a></p>
        </div>"""
 
-        commercial_df, asr_models_df, tts_models_df, asr_models_count, tts_models_count, asr_datasets_df, tts_datasets_df, asr_datasets_count, tts_datasets_count, logs = search_language_resources(language_code, deduplicate=deduplicate)
+        cv_stats, commercial_df, asr_models_df, tts_models_df, asr_models_count, tts_models_count, asr_datasets_df, tts_datasets_df, asr_datasets_count, tts_datasets_count, logs = search_language_resources(language_code, deduplicate=deduplicate)
+
+        # Create Common Voice info HTML
+        if cv_stats:
+            cv_info_html = f"""<div style='padding: 15px; border: 2px solid #4caf50; border-radius: 4px; background-color: #ffffff;'>
+            <div style='margin-bottom: 12px;'>
+                <span style='font-size: 18px;'>✅</span>
+                <span style='font-weight: bold; color: #2e7d32; font-size: 14px; margin-left: 4px;'>Available</span>
+            </div>
+            <table style='width: 100%; border-collapse: collapse; font-size: 13px;'>
+                <tr>
+                    <td style='padding: 3px 8px 3px 0; color: #666; width: 45%;'>Locale</td>
+                    <td style='padding: 3px 0; color: #000; font-weight: 500;'>{cv_stats['locale']}</td>
+                </tr>
+                <tr>
+                    <td style='padding: 3px 8px 3px 0; color: #666;'>Valid Hours</td>
+                    <td style='padding: 3px 0; color: #000; font-weight: 500;'>{cv_stats['valid_hrs']:.1f}h</td>
+                </tr>
+                <tr>
+                    <td style='padding: 3px 8px 3px 0; color: #666;'>Total Hours</td>
+                    <td style='padding: 3px 0; color: #000; font-weight: 500;'>{cv_stats['total_hrs']:.1f}h</td>
+                </tr>
+                <tr>
+                    <td style='padding: 3px 8px 3px 0; color: #666;'>Gender</td>
+                    <td style='padding: 3px 0; color: #000; font-weight: 500;'>{cv_stats['male_pct']:.0f}% M / {cv_stats['female_pct']:.0f}% F</td>
+                </tr>
+                <tr>
+                    <td style='padding: 3px 8px 3px 0; color: #666;'>Version</td>
+                    <td style='padding: 3px 0; color: #000; font-weight: 500;'>{COMMON_VOICE_VERSION}</td>
+                </tr>
+            </table>
+            </div>"""
+        else:
+            cv_info_html = """<div style='padding: 15px; border: 2px solid #e0e0e0; border-radius: 4px; background-color: #fafafa;'>
+            <div style='margin-bottom: 8px;'>
+                <span style='font-size: 18px;'>❌</span>
+                <span style='font-weight: bold; color: #666; font-size: 14px; margin-left: 4px;'>Not Available</span>
+            </div>
+            <p style='margin: 0; color: #999; font-size: 12px;'>Not in Common Voice dataset</p>
+            </div>"""
 
         # Create count labels
         asr_models_label = f"**Found {asr_models_count} ASR model(s)**"
@@ -1398,20 +1525,20 @@ with gr.Blocks(title=APP_CONTENT["title"]) as demo:
         asr_datasets_label = f"**Found {asr_datasets_count} ASR dataset(s)**"
         tts_datasets_label = f"**Found {tts_datasets_count} TTS dataset(s)**"
 
-        return metadata_html, commercial_df, asr_models_label, asr_models_df, tts_models_label, tts_models_df, asr_datasets_label, asr_datasets_df, tts_datasets_label, tts_datasets_df, logs
+        return metadata_html, cv_info_html, commercial_df, asr_models_label, asr_models_df, tts_models_label, tts_models_df, asr_datasets_label, asr_datasets_df, tts_datasets_label, tts_datasets_df, logs
 
     # Trigger search when language is selected
    language_dropdown.change(
        fn=on_search,
        inputs=[language_dropdown, deduplicate_checkbox],
-        outputs=[language_metadata, commercial_table, asr_count_label, asr_models_table, tts_count_label, tts_models_table, asr_datasets_count_label, asr_datasets_table, tts_datasets_count_label, tts_datasets_table, log_textbox],
+        outputs=[language_metadata, cv_info, commercial_table, asr_count_label, asr_models_table, tts_count_label, tts_models_table, asr_datasets_count_label, asr_datasets_table, tts_datasets_count_label, tts_datasets_table, log_textbox],
    )
 
    # Trigger search when deduplicate checkbox is changed
    deduplicate_checkbox.change(
        fn=on_search,
        inputs=[language_dropdown, deduplicate_checkbox],
-        outputs=[language_metadata, commercial_table, asr_count_label, asr_models_table, tts_count_label, tts_models_table, asr_datasets_count_label, asr_datasets_table, tts_datasets_count_label, tts_datasets_table, log_textbox],
+        outputs=[language_metadata, cv_info, commercial_table, asr_count_label, asr_models_table, tts_count_label, tts_models_table, asr_datasets_count_label, asr_datasets_table, tts_datasets_count_label, tts_datasets_table, log_textbox],
    )
 
 if __name__ == "__main__":
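
Quick sanity check for the new Common Voice integration: the snippet below reads the cv-corpus-20.0-2024-12-06.json file added in this commit and prints per-locale statistics using the same keys that load_common_voice_data() and get_common_voice_stats() rely on (a top-level "locales" dict with validHrs, totalHrs, and splits.gender). This is a minimal sketch, assuming the JSON file sits in the working directory; it is not part of the app itself.

    import json

    # Load the Common Voice release statistics shipped with this commit.
    with open("cv-corpus-20.0-2024-12-06.json", "r", encoding="utf-8") as f:
        locales = json.load(f).get("locales", {})

    print(f"{len(locales)} Common Voice locales in release 20.0 (2024-12-06)")

    # Locale codes taken from the comments in get_common_voice_stats() above.
    for code in ("en", "kab", "zgh"):
        stats = locales.get(code)
        if not stats:
            print(f"{code}: not in Common Voice")
            continue
        gender = stats.get("splits", {}).get("gender", {})
        print(
            f"{code}: {stats.get('validHrs', 0):.1f}h valid / "
            f"{stats.get('totalHrs', 0):.1f}h total, "
            f"{gender.get('male_masculine', 0) * 100:.0f}% male / "
            f"{gender.get('female_feminine', 0) * 100:.0f}% female"
        )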
app_content.md
CHANGED
@@ -14,6 +14,12 @@ Built by CLEAR Global to support language inclusion and help close the digital l
 
 ## Data Sources
 
+### Common Voice
+
+[Common Voice](https://commonvoice.mozilla.org/) is Mozilla's crowdsourced, open-source speech dataset. For languages listed here, anyone can contribute voice recordings and anyone can download the data to build speech technology. Datasets are available through the [Mozilla Data Collective](https://datacollective.mozillafoundation.org/datasets).
+
+**Dataset Version:** 20.0 (2024-12-06)
+
 ### Commercial Speech Services
 
 Commercial service support is automatically pulled from the language support page of each service provider.
@@ -44,10 +50,17 @@ The resource classification shown for each language is based on [Joshi et al.'s
 
 **Note:** This classification is from 2020 research and may not reflect the current state of resources for all languages. The landscape of speech technology is rapidly evolving, and some languages have surely gained more resources since this study was conducted.
 
+## Language Coverage
+
+This tool supports **7,925 languages** based on the comprehensive [ISO 639-3 standard](https://iso639-3.sil.org/code_tables/639/data) maintained by SIL International. ISO 639-3 provides unique three-letter codes for all known human languages, including living, extinct, ancient, and constructed languages.
+
+The language codes are mapped to both:
+- **ISO 639-1** (2-letter codes) - Used by most modern services
+- **ISO 639-2** (3-letter codes) - Used for broader language coverage
+
 ## Disclaimer
 
-- This is not an exhaustive list of speech and language technology resources. There are other commercial voice technology providers and dataset/model resources that this app doesn't cover.
+- This is not an exhaustive list of speech and language technology resources. There are other commercial voice technology providers and dataset/model resources that this app doesn't cover.
 - Data fetched in real-time and can change.
 - Model deduplication discards models with same name uploaded by others and keeps only the most downloaded version in the list.
 - A maximum of 100 dataset and model entries from Hugging Face are shown.
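
The deduplication rule mentioned in the disclaimer ("keeps only the most downloaded version" of models sharing a name) is implemented elsewhere in app.py and is not part of this diff. A minimal sketch of that rule, assuming each Hugging Face entry carries an "id" like "org/model-name" and a "downloads" count (the field names here are illustrative, not the app's actual variables):

    # Keep, for each model name, only the most downloaded upload.
    def deduplicate_models(models):
        best = {}
        for model in models:
            # "openai/whisper-small" and "someone/whisper-small" share the name part
            name = model["id"].split("/")[-1]
            if name not in best or model["downloads"] > best[name]["downloads"]:
                best[name] = model
        return list(best.values())

    models = [
        {"id": "openai/whisper-small", "downloads": 1_000_000},
        {"id": "someone/whisper-small", "downloads": 1_200},
        {"id": "org/some-tts-model", "downloads": 5_000},
    ]
    # Keeps openai/whisper-small and org/some-tts-model.
    print([m["id"] for m in deduplicate_models(models)])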
cv-corpus-20.0-2024-12-06.json
ADDED
The diff for this file is too large to render. See raw diff.
|
|
language-codes-full.csv
CHANGED
The diff for this file is too large to render. See raw diff.
|
|
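The language-codes-full.csv diff above is too large to render, but the updated load_language_list() in app.py shows the column layout it is expected to have (ISO 639-2, ISO 639-1, English name, French name). A minimal sketch that reads the file directly under that assumption, independent of the app:

    import csv

    # Build a {code: info} map from the CSV, mirroring the columns used by
    # load_language_list() in this commit. Assumes the file is in the
    # working directory.
    languages = {}
    with open("language-codes-full.csv", newline="", encoding="utf-8") as f:
        for row in csv.DictReader(f):
            code = row["ISO 639-2"].strip()
            name = row["English name"].strip()
            if code and name:
                languages[code] = {
                    "name": name,
                    "iso_639_1": row["ISO 639-1"].strip(),
                    "french_name": row["French name"].strip(),
                }

    print(f"{len(languages)} languages loaded")  # per the commit message, 7925
    print(languages.get("deu"))  # expected to map to German, with iso_639_1 "de"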