Alp committed on
Commit
a85d2eb
·
1 Parent(s): f52126a

CV info, full 7925 langs

Browse files
app.py CHANGED
@@ -6,20 +6,27 @@ from functools import lru_cache
6
  import csv
7
  from io import StringIO
8
  import re
 
9
 
10
  # Configuration
11
  LANGUAGE_CODES_FILE = "language-codes-full.csv"
12
  APP_CONTENT_FILE = "app_content.md"
13
  LANGUAGE_TAXONOMY_URL = "https://microsoft.github.io/linguisticdiversity/assets/lang2tax.txt"
 
 
14
 
15
  # Language list will be loaded from CSV
16
- # Structure: {alpha3_b: {"name": str, "alpha3_t": str, "alpha2": str}}
17
  LANGUAGES = {}
18
 
19
  # Language taxonomy mapping (from Joshi et al.'s linguistic diversity paper)
20
  # Structure: {language_name_lowercase: level}
21
  LANGUAGE_TAXONOMY = {}
22
 
 
 
 
 
23
  # Taxonomy level descriptions
24
  TAXONOMY_LEVELS = {
25
  0: "The Left-Behinds",
@@ -89,32 +96,27 @@ def load_language_list(csv_path=None):
89
  with open(csv_path, 'r', encoding='utf-8') as f:
90
  reader = csv.DictReader(f)
91
  for row in reader:
92
- # Use alpha3-b as primary key, fallback to alpha3-t if empty
93
- code_b = row['alpha3-b'].strip()
94
- code_t = row['alpha3-t'].strip()
95
- code_2 = row['alpha2'].strip()
96
- name = row['English'].strip()
97
-
98
- primary_code = code_b if code_b else code_t
99
-
100
- if primary_code and name:
101
- LANGUAGES[primary_code] = {
102
  "name": name,
103
- "alpha3_b": code_b,
104
- "alpha3_t": code_t,
105
- "alpha2": code_2
 
 
 
106
  }
107
  print(f"Loaded {len(LANGUAGES)} languages from {csv_path}")
108
  except Exception as e:
109
- print(f"Error loading language list: {e}")
110
- # Fallback to a minimal set
111
- LANGUAGES = {
112
- "eng": {"name": "English", "alpha3_b": "eng", "alpha3_t": "", "alpha2": "en"},
113
- "spa": {"name": "Spanish", "alpha3_b": "spa", "alpha3_t": "", "alpha2": "es"},
114
- "fra": {"name": "French", "alpha3_b": "fra", "alpha3_t": "", "alpha2": "fr"},
115
- "deu": {"name": "German", "alpha3_b": "ger", "alpha3_t": "deu", "alpha2": "de"},
116
- }
117
- print(f"Using fallback with {len(LANGUAGES)} languages")
118
 
119
  def load_language_taxonomy():
120
  """Load language taxonomy data from Microsoft's linguistic diversity project"""
@@ -138,6 +140,67 @@ def load_language_taxonomy():
138
  print(f"Warning: Could not load language taxonomy: {e}")
139
  print("Language classification will show as 'Unknown'")
140
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
141
  def get_taxonomy_color(level):
142
  """
143
  Get color code for taxonomy level (red for left-behind, green for winners)
@@ -987,15 +1050,25 @@ def search_language_resources(language_code, deduplicate=False):
987
  all_logs = []
988
 
989
  if not language_code:
990
- return None, None, None, 0, 0, None, None, 0, 0, ""
991
 
992
  lang_info = LANGUAGES.get(language_code)
993
  if not lang_info:
994
- return None, None, None, 0, 0, None, None, 0, 0, ""
995
 
996
  language_name = lang_info['name']
997
  all_logs.append(f"=== Searching for {language_name} ({language_code}) ===")
998
- all_logs.append(f"Language codes: alpha2={lang_info['alpha2']}, alpha3_b={lang_info['alpha3_b']}, alpha3_t={lang_info['alpha3_t']}")
 
 
 
 
 
 
 
 
 
 
999
 
1000
  # Fetch Azure data
1001
  all_logs.append("\n[Azure Speech Services]")
@@ -1250,14 +1323,15 @@ def search_language_resources(language_code, deduplicate=False):
1250
  # Combine all logs
1251
  log_text = "\n".join(all_logs)
1252
 
1253
- # Return separate ASR and TTS dataframes, plus counts for tab labels, plus logs
1254
- return commercial_df, asr_models_df, tts_models_df, len(asr_models), len(tts_models), asr_datasets_df, tts_datasets_df, len(asr_datasets), len(tts_datasets), log_text
1255
 
1256
  # Initialize - load language list and app content
1257
  print("Initializing Speech Resource Finder...")
1258
  load_app_content()
1259
  load_language_list()
1260
  load_language_taxonomy()
 
1261
 
1262
  # Create language choices for dropdown (code: name format for easy searching)
1263
  language_choices = [f"{code}: {info['name']}" for code, info in sorted(LANGUAGES.items(), key=lambda x: x[1]['name'])]
@@ -1268,7 +1342,7 @@ with gr.Blocks(title=APP_CONTENT["title"]) as demo:
1268
  gr.Markdown(APP_CONTENT["description"])
1269
 
1270
  with gr.Row(equal_height=True):
1271
- with gr.Column(scale=2):
1272
  language_dropdown = gr.Dropdown(
1273
  choices=language_choices,
1274
  label="Select Language",
@@ -1276,7 +1350,7 @@ with gr.Blocks(title=APP_CONTENT["title"]) as demo:
1276
  allow_custom_value=False,
1277
  filterable=True,
1278
  )
1279
- with gr.Column(scale=1):
1280
  language_metadata = gr.HTML(
1281
  """<div style='padding: 15px; border: 2px solid #e0e0e0; border-radius: 4px; background-color: #fafafa; height: 100%; display: flex; align-items: center; justify-content: center; box-sizing: border-box;'>
1282
  <p style='margin: 0; color: #333; font-size: 14px;'>Select a language to see resource classification</p>
@@ -1284,12 +1358,23 @@ with gr.Blocks(title=APP_CONTENT["title"]) as demo:
1284
  elem_id="language-metadata"
1285
  )
1286
 
1287
- gr.Markdown("## Commercial Services")
1288
- commercial_table = gr.Dataframe(
1289
- headers=["Service", "ASR", "TTS"],
1290
- interactive=False,
1291
- wrap=True,
1292
- )
 
 
 
 
 
 
 
 
 
 
 
1293
 
1294
  gr.Markdown("## HuggingFace Models")
1295
 
@@ -1361,7 +1446,10 @@ with gr.Blocks(title=APP_CONTENT["title"]) as demo:
1361
  default_html = """<div style='padding: 15px; border: 2px solid #e0e0e0; border-radius: 4px; background-color: #fafafa; height: 100%; display: flex; align-items: center; justify-content: center; box-sizing: border-box;'>
1362
  <p style='margin: 0; color: #333; font-size: 14px;'>Select a language to see resource classification</p>
1363
  </div>"""
1364
- return default_html, None, "", None, "", None, "", None, "", None, ""
 
 
 
1365
  # Extract the language code from "code: name" format
1366
  language_code = language_selection.split(":")[0].strip()
1367
 
@@ -1390,7 +1478,46 @@ with gr.Blocks(title=APP_CONTENT["title"]) as demo:
1390
  <p style='margin: 8px 0 0 0; font-size: 11px; color: #555;'>Source: <a href='https://microsoft.github.io/linguisticdiversity/' target='_blank' style='color: #1976d2; text-decoration: none;'>Joshi et al.</a></p>
1391
  </div>"""
1392
 
1393
- commercial_df, asr_models_df, tts_models_df, asr_models_count, tts_models_count, asr_datasets_df, tts_datasets_df, asr_datasets_count, tts_datasets_count, logs = search_language_resources(language_code, deduplicate=deduplicate)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1394
 
1395
  # Create count labels
1396
  asr_models_label = f"**Found {asr_models_count} ASR model(s)**"
@@ -1398,20 +1525,20 @@ with gr.Blocks(title=APP_CONTENT["title"]) as demo:
1398
  asr_datasets_label = f"**Found {asr_datasets_count} ASR dataset(s)**"
1399
  tts_datasets_label = f"**Found {tts_datasets_count} TTS dataset(s)**"
1400
 
1401
- return metadata_html, commercial_df, asr_models_label, asr_models_df, tts_models_label, tts_models_df, asr_datasets_label, asr_datasets_df, tts_datasets_label, tts_datasets_df, logs
1402
 
1403
  # Trigger search when language is selected
1404
  language_dropdown.change(
1405
  fn=on_search,
1406
  inputs=[language_dropdown, deduplicate_checkbox],
1407
- outputs=[language_metadata, commercial_table, asr_count_label, asr_models_table, tts_count_label, tts_models_table, asr_datasets_count_label, asr_datasets_table, tts_datasets_count_label, tts_datasets_table, log_textbox],
1408
  )
1409
 
1410
  # Trigger search when deduplicate checkbox is changed
1411
  deduplicate_checkbox.change(
1412
  fn=on_search,
1413
  inputs=[language_dropdown, deduplicate_checkbox],
1414
- outputs=[language_metadata, commercial_table, asr_count_label, asr_models_table, tts_count_label, tts_models_table, asr_datasets_count_label, asr_datasets_table, tts_datasets_count_label, tts_datasets_table, log_textbox],
1415
  )
1416
 
1417
  if __name__ == "__main__":
 
6
  import csv
7
  from io import StringIO
8
  import re
9
+ import json
10
 
11
  # Configuration
12
  LANGUAGE_CODES_FILE = "language-codes-full.csv"
13
  APP_CONTENT_FILE = "app_content.md"
14
  LANGUAGE_TAXONOMY_URL = "https://microsoft.github.io/linguisticdiversity/assets/lang2tax.txt"
15
+ COMMON_VOICE_DATA_FILE = "cv-corpus-20.0-2024-12-06.json"
16
+ COMMON_VOICE_VERSION = "20.0 (2024-12-06)"
17
 
18
  # Language list will be loaded from CSV
19
+ # Structure: {iso_639_2: {"name": str, "iso_639_1": str, "french_name": str}}
20
  LANGUAGES = {}
21
 
22
  # Language taxonomy mapping (from Joshi et al.'s linguistic diversity paper)
23
  # Structure: {language_name_lowercase: level}
24
  LANGUAGE_TAXONOMY = {}
25
 
26
+ # Common Voice dataset
27
+ # Structure: {locale_code: {validHrs: float, totalHrs: float, splits: {gender: {...}}, ...}}
28
+ COMMON_VOICE_DATA = {}
29
+
30
  # Taxonomy level descriptions
31
  TAXONOMY_LEVELS = {
32
  0: "The Left-Behinds",
 
96
  with open(csv_path, 'r', encoding='utf-8') as f:
97
  reader = csv.DictReader(f)
98
  for row in reader:
99
+ # New CSV structure: ISO 639-2, ISO 639-1, English name, French name
100
+ iso_639_2 = row['ISO 639-2'].strip()
101
+ iso_639_1 = row['ISO 639-1'].strip()
102
+ name = row['English name'].strip()
103
+ french_name = row['French name'].strip()
104
+
105
+ if iso_639_2 and name:
106
+ LANGUAGES[iso_639_2] = {
 
 
107
  "name": name,
108
+ "iso_639_1": iso_639_1,
109
+ "french_name": french_name,
110
+ # Keep legacy field names for backward compatibility
111
+ "alpha2": iso_639_1,
112
+ "alpha3_b": iso_639_2,
113
+ "alpha3_t": "" # Not used in new format
114
  }
115
  print(f"Loaded {len(LANGUAGES)} languages from {csv_path}")
116
  except Exception as e:
117
+ print(f"ERROR: Failed to load language list from {csv_path}: {e}")
118
+ print("The application cannot run without the language codes CSV file.")
119
+ LANGUAGES = {}
 
 
 
 
 
 
120
 
121
  def load_language_taxonomy():
122
  """Load language taxonomy data from Microsoft's linguistic diversity project"""
 
140
  print(f"Warning: Could not load language taxonomy: {e}")
141
  print("Language classification will show as 'Unknown'")
142
 
143
+ def load_common_voice_data():
144
+ """Load Common Voice dataset statistics"""
145
+ global COMMON_VOICE_DATA
146
+
147
+ try:
148
+ with open(COMMON_VOICE_DATA_FILE, 'r', encoding='utf-8') as f:
149
+ data = json.load(f)
150
+ COMMON_VOICE_DATA = data.get('locales', {})
151
+
152
+ print(f"Loaded Common Voice data for {len(COMMON_VOICE_DATA)} languages")
153
+ except Exception as e:
154
+ print(f"Warning: Could not load Common Voice data: {e}")
155
+ print("Common Voice statistics will not be available")
156
+
157
+ def get_common_voice_stats(language_code):
158
+ """
159
+ Get Common Voice statistics for a language
160
+ Returns dict with validHrs, totalHrs, gender balance, and locale code or None if not available
161
+ """
162
+ lang_info = LANGUAGES.get(language_code)
163
+ if not lang_info:
164
+ return None
165
+
166
+ # Try to find CV data using different code formats
167
+ # 1. Try ISO 639-2 (3-letter) code directly (e.g., "zgh", "kab")
168
+ if language_code in COMMON_VOICE_DATA:
169
+ cv_locale = language_code
170
+ cv_data = COMMON_VOICE_DATA[language_code]
171
+ # 2. Try ISO 639-1 (2-letter) code (e.g., "en", "fr")
172
+ elif lang_info.get('iso_639_1') and lang_info['iso_639_1'] in COMMON_VOICE_DATA:
173
+ cv_locale = lang_info['iso_639_1']
174
+ cv_data = COMMON_VOICE_DATA[lang_info['iso_639_1']]
175
+ # 3. Try to find any locale that starts with the 2-letter code (e.g., "fy-NL", "ga-IE")
176
+ elif lang_info.get('iso_639_1'):
177
+ iso_639_1 = lang_info['iso_639_1']
178
+ matching_locales = [loc for loc in COMMON_VOICE_DATA.keys() if loc.startswith(iso_639_1 + '-')]
179
+ if matching_locales:
180
+ cv_locale = matching_locales[0] # Take the first match
181
+ cv_data = COMMON_VOICE_DATA[cv_locale]
182
+ else:
183
+ return None
184
+ else:
185
+ return None
186
+
187
+ # Extract statistics
188
+ valid_hrs = cv_data.get('validHrs', 0)
189
+ total_hrs = cv_data.get('totalHrs', 0)
190
+
191
+ # Extract gender balance
192
+ gender_splits = cv_data.get('splits', {}).get('gender', {})
193
+ male_pct = gender_splits.get('male_masculine', 0) * 100
194
+ female_pct = gender_splits.get('female_feminine', 0) * 100
195
+
196
+ return {
197
+ 'locale': cv_locale,
198
+ 'valid_hrs': valid_hrs,
199
+ 'total_hrs': total_hrs,
200
+ 'male_pct': male_pct,
201
+ 'female_pct': female_pct
202
+ }
203
+
204
  def get_taxonomy_color(level):
205
  """
206
  Get color code for taxonomy level (red for left-behind, green for winners)
 
1050
  all_logs = []
1051
 
1052
  if not language_code:
1053
+ return None, None, None, None, 0, 0, None, None, 0, 0, ""
1054
 
1055
  lang_info = LANGUAGES.get(language_code)
1056
  if not lang_info:
1057
+ return None, None, None, None, 0, 0, None, None, 0, 0, ""
1058
 
1059
  language_name = lang_info['name']
1060
  all_logs.append(f"=== Searching for {language_name} ({language_code}) ===")
1061
+ all_logs.append(f"Language codes: ISO 639-1={lang_info['iso_639_1']}, ISO 639-2={language_code}")
1062
+
1063
+ # Check Common Voice data
1064
+ all_logs.append("\n[Common Voice Dataset]")
1065
+ cv_stats = get_common_voice_stats(language_code)
1066
+ if cv_stats:
1067
+ all_logs.append(f" ✅ Available in Common Voice (locale: {cv_stats['locale']})")
1068
+ all_logs.append(f" Valid hours: {cv_stats['valid_hrs']:.1f}h, Total hours: {cv_stats['total_hrs']:.1f}h")
1069
+ all_logs.append(f" Gender balance: {cv_stats['male_pct']:.1f}% male, {cv_stats['female_pct']:.1f}% female")
1070
+ else:
1071
+ all_logs.append(f" ❌ Not available in Common Voice")
1072
 
1073
  # Fetch Azure data
1074
  all_logs.append("\n[Azure Speech Services]")
 
1323
  # Combine all logs
1324
  log_text = "\n".join(all_logs)
1325
 
1326
+ # Return CV stats, commercial services, models, datasets, and logs
1327
+ return cv_stats, commercial_df, asr_models_df, tts_models_df, len(asr_models), len(tts_models), asr_datasets_df, tts_datasets_df, len(asr_datasets), len(tts_datasets), log_text
1328
 
1329
  # Initialize - load language list and app content
1330
  print("Initializing Speech Resource Finder...")
1331
  load_app_content()
1332
  load_language_list()
1333
  load_language_taxonomy()
1334
+ load_common_voice_data()
1335
 
1336
  # Create language choices for dropdown (code: name format for easy searching)
1337
  language_choices = [f"{code}: {info['name']}" for code, info in sorted(LANGUAGES.items(), key=lambda x: x[1]['name'])]
 
1342
  gr.Markdown(APP_CONTENT["description"])
1343
 
1344
  with gr.Row(equal_height=True):
1345
+ with gr.Column(scale=70):
1346
  language_dropdown = gr.Dropdown(
1347
  choices=language_choices,
1348
  label="Select Language",
 
1350
  allow_custom_value=False,
1351
  filterable=True,
1352
  )
1353
+ with gr.Column(scale=30):
1354
  language_metadata = gr.HTML(
1355
  """<div style='padding: 15px; border: 2px solid #e0e0e0; border-radius: 4px; background-color: #fafafa; height: 100%; display: flex; align-items: center; justify-content: center; box-sizing: border-box;'>
1356
  <p style='margin: 0; color: #333; font-size: 14px;'>Select a language to see resource classification</p>
 
1358
  elem_id="language-metadata"
1359
  )
1360
 
1361
+ with gr.Row():
1362
+ with gr.Column(scale=70):
1363
+ gr.Markdown("## Commercial Services")
1364
+ commercial_table = gr.Dataframe(
1365
+ headers=["Service", "ASR", "TTS"],
1366
+ interactive=False,
1367
+ wrap=True,
1368
+ )
1369
+
1370
+ with gr.Column(scale=30):
1371
+ gr.Markdown("## Common Voice")
1372
+ cv_info = gr.HTML(
1373
+ """<div style='padding: 15px; border: 2px solid #e0e0e0; border-radius: 4px; background-color: #fafafa;'>
1374
+ <p style='margin: 0; color: #666; font-size: 13px;'>Select a language</p>
1375
+ </div>""",
1376
+ elem_id="cv-info"
1377
+ )
1378
 
1379
  gr.Markdown("## HuggingFace Models")
1380
 
 
1446
  default_html = """<div style='padding: 15px; border: 2px solid #e0e0e0; border-radius: 4px; background-color: #fafafa; height: 100%; display: flex; align-items: center; justify-content: center; box-sizing: border-box;'>
1447
  <p style='margin: 0; color: #333; font-size: 14px;'>Select a language to see resource classification</p>
1448
  </div>"""
1449
+ cv_default_html = """<div style='padding: 15px; border: 2px solid #e0e0e0; border-radius: 4px; background-color: #fafafa;'>
1450
+ <p style='margin: 0; color: #666; font-size: 13px;'>Select a language</p>
1451
+ </div>"""
1452
+ return default_html, cv_default_html, None, "", None, "", None, "", None, "", None, ""
1453
  # Extract the language code from "code: name" format
1454
  language_code = language_selection.split(":")[0].strip()
1455
 
 
1478
  <p style='margin: 8px 0 0 0; font-size: 11px; color: #555;'>Source: <a href='https://microsoft.github.io/linguisticdiversity/' target='_blank' style='color: #1976d2; text-decoration: none;'>Joshi et al.</a></p>
1479
  </div>"""
1480
 
1481
+ cv_stats, commercial_df, asr_models_df, tts_models_df, asr_models_count, tts_models_count, asr_datasets_df, tts_datasets_df, asr_datasets_count, tts_datasets_count, logs = search_language_resources(language_code, deduplicate=deduplicate)
1482
+
1483
+ # Create Common Voice info HTML
1484
+ if cv_stats:
1485
+ cv_info_html = f"""<div style='padding: 15px; border: 2px solid #4caf50; border-radius: 4px; background-color: #ffffff;'>
1486
+ <div style='margin-bottom: 12px;'>
1487
+ <span style='font-size: 18px;'>✅</span>
1488
+ <span style='font-weight: bold; color: #2e7d32; font-size: 14px; margin-left: 4px;'>Available</span>
1489
+ </div>
1490
+ <table style='width: 100%; border-collapse: collapse; font-size: 13px;'>
1491
+ <tr>
1492
+ <td style='padding: 3px 8px 3px 0; color: #666; width: 45%;'>Locale</td>
1493
+ <td style='padding: 3px 0; color: #000; font-weight: 500;'>{cv_stats['locale']}</td>
1494
+ </tr>
1495
+ <tr>
1496
+ <td style='padding: 3px 8px 3px 0; color: #666;'>Valid Hours</td>
1497
+ <td style='padding: 3px 0; color: #000; font-weight: 500;'>{cv_stats['valid_hrs']:.1f}h</td>
1498
+ </tr>
1499
+ <tr>
1500
+ <td style='padding: 3px 8px 3px 0; color: #666;'>Total Hours</td>
1501
+ <td style='padding: 3px 0; color: #000; font-weight: 500;'>{cv_stats['total_hrs']:.1f}h</td>
1502
+ </tr>
1503
+ <tr>
1504
+ <td style='padding: 3px 8px 3px 0; color: #666;'>Gender</td>
1505
+ <td style='padding: 3px 0; color: #000; font-weight: 500;'>{cv_stats['male_pct']:.0f}% M / {cv_stats['female_pct']:.0f}% F</td>
1506
+ </tr>
1507
+ <tr>
1508
+ <td style='padding: 3px 8px 3px 0; color: #666;'>Version</td>
1509
+ <td style='padding: 3px 0; color: #000; font-weight: 500;'>{COMMON_VOICE_VERSION}</td>
1510
+ </tr>
1511
+ </table>
1512
+ </div>"""
1513
+ else:
1514
+ cv_info_html = """<div style='padding: 15px; border: 2px solid #e0e0e0; border-radius: 4px; background-color: #fafafa;'>
1515
+ <div style='margin-bottom: 8px;'>
1516
+ <span style='font-size: 18px;'>❌</span>
1517
+ <span style='font-weight: bold; color: #666; font-size: 14px; margin-left: 4px;'>Not Available</span>
1518
+ </div>
1519
+ <p style='margin: 0; color: #999; font-size: 12px;'>Not in Common Voice dataset</p>
1520
+ </div>"""
1521
 
1522
  # Create count labels
1523
  asr_models_label = f"**Found {asr_models_count} ASR model(s)**"
 
1525
  asr_datasets_label = f"**Found {asr_datasets_count} ASR dataset(s)**"
1526
  tts_datasets_label = f"**Found {tts_datasets_count} TTS dataset(s)**"
1527
 
1528
+ return metadata_html, cv_info_html, commercial_df, asr_models_label, asr_models_df, tts_models_label, tts_models_df, asr_datasets_label, asr_datasets_df, tts_datasets_label, tts_datasets_df, logs
1529
 
1530
  # Trigger search when language is selected
1531
  language_dropdown.change(
1532
  fn=on_search,
1533
  inputs=[language_dropdown, deduplicate_checkbox],
1534
+ outputs=[language_metadata, cv_info, commercial_table, asr_count_label, asr_models_table, tts_count_label, tts_models_table, asr_datasets_count_label, asr_datasets_table, tts_datasets_count_label, tts_datasets_table, log_textbox],
1535
  )
1536
 
1537
  # Trigger search when deduplicate checkbox is changed
1538
  deduplicate_checkbox.change(
1539
  fn=on_search,
1540
  inputs=[language_dropdown, deduplicate_checkbox],
1541
+ outputs=[language_metadata, cv_info, commercial_table, asr_count_label, asr_models_table, tts_count_label, tts_models_table, asr_datasets_count_label, asr_datasets_table, tts_datasets_count_label, tts_datasets_table, log_textbox],
1542
  )
1543
 
1544
  if __name__ == "__main__":
app_content.md CHANGED
@@ -14,6 +14,12 @@ Built by CLEAR Global to support language inclusion and help close the digital l
14
 
15
  ## Data Sources
16
 
 
 
 
 
 
 
17
  ### Commercial Speech Services
18
 
19
  Commercial service support is automatically pulled from the language support page of each service provider.
@@ -44,10 +50,17 @@ The resource classification shown for each language is based on [Joshi et al.'s
44
 
45
  **Note:** This classification is from 2020 research and may not reflect the current state of resources for all languages. The landscape of speech technology is rapidly evolving, and some languages have surely gained more resources since this study was conducted.
46
 
 
 
 
 
 
 
 
 
47
  ## Disclaimer
48
 
49
- - The language list only contains 487 languages and is taken from this [Github repository](https://github.com/datasets/language-codes).
50
- - This is not an exhaustive list of speech and language technology resources. There are other commercial voice technology providers and dataset/model resources that this app doesn't cover.
51
  - Data is fetched in real-time and can change.
52
  - Model deduplication discards models with the same name uploaded by others and keeps only the most downloaded version in the list.
53
  - A maximum of 100 dataset and model entries from Hugging Face are shown.
 
14
 
15
  ## Data Sources
16
 
17
+ ### Common Voice
18
+
19
+ [Common Voice](https://commonvoice.mozilla.org/) is Mozilla's crowdsourced, open-source speech dataset. For languages listed here, anyone can contribute voice recordings and anyone can download the data to build speech technology. Datasets are available through the [Mozilla Data Collective](https://datacollective.mozillafoundation.org/datasets).
20
+
21
+ **Dataset Version:** 20.0 (2024-12-06)
22
+
23
  ### Commercial Speech Services
24
 
25
  Commercial service support is automatically pulled from the language support page of each service provider.
 
50
 
51
  **Note:** This classification is from 2020 research and may not reflect the current state of resources for all languages. The landscape of speech technology is rapidly evolving, and some languages have surely gained more resources since this study was conducted.
52
 
53
+ ## Language Coverage
54
+
55
+ This tool supports **7,925 languages** based on the comprehensive [ISO 639-3 standard](https://iso639-3.sil.org/code_tables/639/data) maintained by SIL International. ISO 639-3 provides unique three-letter codes for all known human languages, including living, extinct, ancient, and constructed languages.
56
+
57
+ The language codes are mapped to both:
58
+ - **ISO 639-1** (2-letter codes) - Used by most modern services
59
+ - **ISO 639-2** (3-letter codes) - Used for broader language coverage
60
+
61
  ## Disclaimer
62
 
63
+ - This is not an exhaustive list of speech and language technology resources. There are other commercial voice technology providers and dataset/model resources that this app doesn't cover.
 
64
  - Data is fetched in real-time and can change.
65
  - Model deduplication discards models with the same name uploaded by others and keeps only the most downloaded version in the list.
66
  - A maximum of 100 dataset and model entries from Hugging Face are shown.
cv-corpus-20.0-2024-12-06.json ADDED
The diff for this file is too large to render. See raw diff
 
language-codes-full.csv CHANGED
The diff for this file is too large to render. See raw diff