Alp commited on
Commit
20e07f3
·
1 Parent(s): 4af7226
Files changed (3) hide show
  1. __pycache__/app.cpython-312.pyc +0 -0
  2. app.py +141 -27
  3. app_content.md +16 -3
__pycache__/app.cpython-312.pyc ADDED
Binary file (56 kB). View file
 
app.py CHANGED
@@ -10,11 +10,26 @@ import re
10
  # Configuration
11
  LANGUAGE_CODES_FILE = "language-codes-full.csv"
12
  APP_CONTENT_FILE = "app_content.md"
 
13
 
14
  # Language list will be loaded from CSV
15
  # Structure: {alpha3_b: {"name": str, "alpha3_t": str, "alpha2": str}}
16
  LANGUAGES = {}
17
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
18
  # App content will be loaded from markdown file
19
  APP_CONTENT = {
20
  "title": "Speech Resource Finder",
@@ -101,6 +116,74 @@ def load_language_list(csv_path=None):
101
  }
102
  print(f"Using fallback with {len(LANGUAGES)} languages")
103
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
104
  @lru_cache(maxsize=1)
105
  def fetch_azure_asr_languages():
106
  """Scrape Azure Speech-to-Text supported languages"""
@@ -1174,6 +1257,7 @@ def search_language_resources(language_code, deduplicate=False):
1174
  print("Initializing Speech Resource Finder...")
1175
  load_app_content()
1176
  load_language_list()
 
1177
 
1178
  # Create language choices for dropdown (code: name format for easy searching)
1179
  language_choices = [f"{code}: {info['name']}" for code, info in sorted(LANGUAGES.items(), key=lambda x: x[1]['name'])]
@@ -1183,23 +1267,23 @@ with gr.Blocks(title=APP_CONTENT["title"]) as demo:
1183
  gr.Markdown(f"# {APP_CONTENT['title']}")
1184
  gr.Markdown(APP_CONTENT["description"])
1185
 
1186
- with gr.Row():
1187
- language_dropdown = gr.Dropdown(
1188
- choices=language_choices,
1189
- label="Select Language",
1190
- info="Type to search for a language",
1191
- allow_custom_value=False,
1192
- filterable=True,
1193
- )
1194
- search_btn = gr.Button("Search", variant="primary")
 
 
 
 
 
 
 
1195
 
1196
- with gr.Row():
1197
- deduplicate_checkbox = gr.Checkbox(
1198
- label="Deduplicate models",
1199
- value=True,
1200
- info="Keep only the model with most downloads for each base name"
1201
- )
1202
-
1203
  gr.Markdown("## Commercial Services")
1204
  commercial_table = gr.Dataframe(
1205
  headers=["Service", "ASR", "TTS"],
@@ -1209,6 +1293,13 @@ with gr.Blocks(title=APP_CONTENT["title"]) as demo:
1209
 
1210
  gr.Markdown("## HuggingFace Models")
1211
 
 
 
 
 
 
 
 
1212
  # Create tabs for ASR and TTS models with count labels
1213
  with gr.Tabs():
1214
  with gr.Tab(label="ASR Models") as asr_tab:
@@ -1266,9 +1357,38 @@ with gr.Blocks(title=APP_CONTENT["title"]) as demo:
1266
 
1267
  def on_search(language_selection, deduplicate):
1268
  if not language_selection:
1269
- return None, "", None, "", None, "", None, "", None, ""
 
 
 
1270
  # Extract the language code from "code: name" format
1271
  language_code = language_selection.split(":")[0].strip()
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1272
  commercial_df, asr_models_df, tts_models_df, asr_models_count, tts_models_count, asr_datasets_df, tts_datasets_df, asr_datasets_count, tts_datasets_count, logs = search_language_resources(language_code, deduplicate=deduplicate)
1273
 
1274
  # Create count labels
@@ -1277,26 +1397,20 @@ with gr.Blocks(title=APP_CONTENT["title"]) as demo:
1277
  asr_datasets_label = f"**Found {asr_datasets_count} ASR dataset(s)**"
1278
  tts_datasets_label = f"**Found {tts_datasets_count} TTS dataset(s)**"
1279
 
1280
- return commercial_df, asr_models_label, asr_models_df, tts_models_label, tts_models_df, asr_datasets_label, asr_datasets_df, tts_datasets_label, tts_datasets_df, logs
1281
-
1282
- search_btn.click(
1283
- fn=on_search,
1284
- inputs=[language_dropdown, deduplicate_checkbox],
1285
- outputs=[commercial_table, asr_count_label, asr_models_table, tts_count_label, tts_models_table, asr_datasets_count_label, asr_datasets_table, tts_datasets_count_label, tts_datasets_table, log_textbox],
1286
- )
1287
 
1288
- # Also trigger search when language is selected
1289
  language_dropdown.change(
1290
  fn=on_search,
1291
  inputs=[language_dropdown, deduplicate_checkbox],
1292
- outputs=[commercial_table, asr_count_label, asr_models_table, tts_count_label, tts_models_table, asr_datasets_count_label, asr_datasets_table, tts_datasets_count_label, tts_datasets_table, log_textbox],
1293
  )
1294
 
1295
  # Trigger search when deduplicate checkbox is changed
1296
  deduplicate_checkbox.change(
1297
  fn=on_search,
1298
  inputs=[language_dropdown, deduplicate_checkbox],
1299
- outputs=[commercial_table, asr_count_label, asr_models_table, tts_count_label, tts_models_table, asr_datasets_count_label, asr_datasets_table, tts_datasets_count_label, tts_datasets_table, log_textbox],
1300
  )
1301
 
1302
  if __name__ == "__main__":
 
10
  # Configuration
11
  LANGUAGE_CODES_FILE = "language-codes-full.csv"
12
  APP_CONTENT_FILE = "app_content.md"
13
+ LANGUAGE_TAXONOMY_URL = "https://microsoft.github.io/linguisticdiversity/assets/lang2tax.txt"
14
 
15
  # Language list will be loaded from CSV
16
  # Structure: {alpha3_b: {"name": str, "alpha3_t": str, "alpha2": str}}
17
  LANGUAGES = {}
18
 
19
+ # Language taxonomy mapping (from Joshi et al.'s linguistic diversity paper)
20
+ # Structure: {language_name_lowercase: level}
21
+ LANGUAGE_TAXONOMY = {}
22
+
23
+ # Taxonomy level descriptions
24
+ TAXONOMY_LEVELS = {
25
+ 0: "The Left-Behinds",
26
+ 1: "The Scraping-Bys",
27
+ 2: "The Hopefuls",
28
+ 3: "The Rising Stars",
29
+ 4: "The Underdogs",
30
+ 5: "The Winners"
31
+ }
32
+
33
  # App content will be loaded from markdown file
34
  APP_CONTENT = {
35
  "title": "Speech Resource Finder",
 
116
  }
117
  print(f"Using fallback with {len(LANGUAGES)} languages")
118
 
119
def load_language_taxonomy():
    """Load language taxonomy data from Microsoft's linguistic diversity project.

    Fetches LANGUAGE_TAXONOMY_URL and populates the module-level
    LANGUAGE_TAXONOMY dict mapping lowercase language name -> taxonomy level
    (0-5, per Joshi et al.). On a network or HTTP failure the function logs a
    warning and leaves LANGUAGE_TAXONOMY unchanged, so lookups fall back to
    "Unknown".
    """
    global LANGUAGE_TAXONOMY

    try:
        response = requests.get(LANGUAGE_TAXONOMY_URL, timeout=10)
        response.raise_for_status()

        # Each non-empty line is "language_name,level". Split from the RIGHT
        # so names that themselves contain commas (e.g. "Chinese, Mandarin" --
        # a form the lookup code explicitly supports) are not silently dropped
        # by a strict two-field split.
        for line in response.text.strip().split('\n'):
            line = line.strip()
            if not line:
                continue
            parts = line.rsplit(',', 1)
            if len(parts) != 2:
                continue
            lang_name = parts[0].strip().lower()
            try:
                level = int(parts[1].strip())
            except ValueError:
                # Skip a malformed row instead of letting the exception
                # abort the whole load via the outer handler.
                continue
            LANGUAGE_TAXONOMY[lang_name] = level

        print(f"Loaded taxonomy data for {len(LANGUAGE_TAXONOMY)} languages")
    except Exception as e:
        print(f"Warning: Could not load language taxonomy: {e}")
        print("Language classification will show as 'Unknown'")
141
def get_taxonomy_color(level):
    """Return the hex color associated with a taxonomy level.

    Level 0 (least-resourced) maps to red and level 5 (best-resourced) to
    green; any unrecognised level falls back to a neutral gray.
    """
    # Palette ordered by level 0..5, red -> green.
    palette = dict(enumerate((
        "#d32f2f",  # 0: The Left-Behinds
        "#f57c00",  # 1: The Scraping-Bys
        "#fbc02d",  # 2: The Hopefuls
        "#afb42b",  # 3: The Rising Stars
        "#7cb342",  # 4: The Underdogs
        "#388e3c",  # 5: The Winners
    )))
    return palette.get(level, "#757575")  # gray for unknown levels
155
def get_language_taxonomy_info(language_name):
    """Look up the Joshi et al. taxonomy class for a language name.

    Matching is case-insensitive and tried in priority order: the full name,
    then each alternative split on ';' (e.g. "Catalan; Valencian"), then each
    variant split on ',' (e.g. "Chinese, Mandarin").

    Returns:
        (level, description) on a hit, or (None, "Unknown") otherwise.
    """
    if not language_name:
        return None, "Unknown"

    needle = language_name.lower()

    # Build the candidate keys in the same priority order the matching
    # should follow: exact name first, then ';' parts, then ',' parts.
    candidates = [needle]
    if ';' in needle:
        candidates.extend(part.strip() for part in needle.split(';'))
    if ',' in needle:
        candidates.extend(part.strip() for part in needle.split(','))

    for candidate in candidates:
        if candidate in LANGUAGE_TAXONOMY:
            level = LANGUAGE_TAXONOMY[candidate]
            return level, TAXONOMY_LEVELS.get(level, f"Level {level}")

    return None, "Unknown"
187
  @lru_cache(maxsize=1)
188
  def fetch_azure_asr_languages():
189
  """Scrape Azure Speech-to-Text supported languages"""
 
1257
  print("Initializing Speech Resource Finder...")
1258
  load_app_content()
1259
  load_language_list()
1260
+ load_language_taxonomy()
1261
 
1262
  # Create language choices for dropdown (code: name format for easy searching)
1263
  language_choices = [f"{code}: {info['name']}" for code, info in sorted(LANGUAGES.items(), key=lambda x: x[1]['name'])]
 
1267
  gr.Markdown(f"# {APP_CONTENT['title']}")
1268
  gr.Markdown(APP_CONTENT["description"])
1269
 
1270
+ with gr.Row(equal_height=True):
1271
+ with gr.Column(scale=2):
1272
+ language_dropdown = gr.Dropdown(
1273
+ choices=language_choices,
1274
+ label="Select Language",
1275
+ info="Type to search for a language",
1276
+ allow_custom_value=False,
1277
+ filterable=True,
1278
+ )
1279
+ with gr.Column(scale=1):
1280
+ language_metadata = gr.HTML(
1281
+ """<div style='padding: 15px; border: 2px solid #e0e0e0; border-radius: 4px; background-color: #fafafa; height: 100%; display: flex; align-items: center; justify-content: center; box-sizing: border-box;'>
1282
+ <p style='margin: 0; color: #333; font-size: 14px;'>Select a language to see resource classification</p>
1283
+ </div>""",
1284
+ elem_id="language-metadata"
1285
+ )
1286
 
 
 
 
 
 
 
 
1287
  gr.Markdown("## Commercial Services")
1288
  commercial_table = gr.Dataframe(
1289
  headers=["Service", "ASR", "TTS"],
 
1293
 
1294
  gr.Markdown("## HuggingFace Models")
1295
 
1296
+ with gr.Row():
1297
+ deduplicate_checkbox = gr.Checkbox(
1298
+ label="Deduplicate models",
1299
+ value=True,
1300
+ info="Keep only the model with most downloads for each base name"
1301
+ )
1302
+
1303
  # Create tabs for ASR and TTS models with count labels
1304
  with gr.Tabs():
1305
  with gr.Tab(label="ASR Models") as asr_tab:
 
1357
 
1358
  def on_search(language_selection, deduplicate):
1359
  if not language_selection:
1360
+ default_html = """<div style='padding: 15px; border: 2px solid #e0e0e0; border-radius: 4px; background-color: #fafafa; height: 100%; display: flex; align-items: center; justify-content: center; box-sizing: border-box;'>
1361
+ <p style='margin: 0; color: #333; font-size: 14px;'>Select a language to see resource classification</p>
1362
+ </div>"""
1363
+ return default_html, None, "", None, "", None, "", None, "", None, ""
1364
  # Extract the language code from "code: name" format
1365
  language_code = language_selection.split(":")[0].strip()
1366
+
1367
+ # Get language name for taxonomy lookup
1368
+ language_name = LANGUAGES.get(language_code, {}).get("name", "")
1369
+
1370
+ # Get taxonomy classification
1371
+ level, classification = get_language_taxonomy_info(language_name)
1372
+
1373
+ # Create metadata display with color coding
1374
+ if level is not None:
1375
+ color = get_taxonomy_color(level)
1376
+ metadata_html = f"""<div style='padding: 15px; border: 2px solid {color}; border-radius: 4px; background-color: #ffffff; height: 100%; box-sizing: border-box; display: flex; flex-direction: column; justify-content: center;'>
1377
+ <h4 style='margin: 0 0 8px 0; color: #333; font-size: 16px;'>{language_name}</h4>
1378
+ <div style='margin: 8px 0;'>
1379
+ <span style='padding: 6px 12px; background-color: {color}; color: white; border-radius: 4px; font-weight: bold; font-size: 13px;'>{classification}</span>
1380
+ </div>
1381
+ <p style='margin: 8px 0 0 0; font-size: 11px; color: #555;'>Source: <a href='https://microsoft.github.io/linguisticdiversity/' target='_blank' style='color: #1976d2; text-decoration: none;'>Joshi et al.</a></p>
1382
+ </div>"""
1383
+ else:
1384
+ metadata_html = f"""<div style='padding: 15px; border: 2px solid #757575; border-radius: 4px; background-color: #ffffff; height: 100%; box-sizing: border-box; display: flex; flex-direction: column; justify-content: center;'>
1385
+ <h4 style='margin: 0 0 8px 0; color: #333; font-size: 16px;'>{language_name}</h4>
1386
+ <div style='margin: 8px 0;'>
1387
+ <span style='padding: 6px 12px; background-color: #757575; color: white; border-radius: 4px; font-weight: bold; font-size: 13px;'>Unknown</span>
1388
+ </div>
1389
+ <p style='margin: 8px 0 0 0; font-size: 11px; color: #555;'>Source: <a href='https://microsoft.github.io/linguisticdiversity/' target='_blank' style='color: #1976d2; text-decoration: none;'>Joshi et al.</a></p>
1390
+ </div>"""
1391
+
1392
  commercial_df, asr_models_df, tts_models_df, asr_models_count, tts_models_count, asr_datasets_df, tts_datasets_df, asr_datasets_count, tts_datasets_count, logs = search_language_resources(language_code, deduplicate=deduplicate)
1393
 
1394
  # Create count labels
 
1397
  asr_datasets_label = f"**Found {asr_datasets_count} ASR dataset(s)**"
1398
  tts_datasets_label = f"**Found {tts_datasets_count} TTS dataset(s)**"
1399
 
1400
+ return metadata_html, commercial_df, asr_models_label, asr_models_df, tts_models_label, tts_models_df, asr_datasets_label, asr_datasets_df, tts_datasets_label, tts_datasets_df, logs
 
 
 
 
 
 
1401
 
1402
+ # Trigger search when language is selected
1403
  language_dropdown.change(
1404
  fn=on_search,
1405
  inputs=[language_dropdown, deduplicate_checkbox],
1406
+ outputs=[language_metadata, commercial_table, asr_count_label, asr_models_table, tts_count_label, tts_models_table, asr_datasets_count_label, asr_datasets_table, tts_datasets_count_label, tts_datasets_table, log_textbox],
1407
  )
1408
 
1409
  # Trigger search when deduplicate checkbox is changed
1410
  deduplicate_checkbox.change(
1411
  fn=on_search,
1412
  inputs=[language_dropdown, deduplicate_checkbox],
1413
+ outputs=[language_metadata, commercial_table, asr_count_label, asr_models_table, tts_count_label, tts_models_table, asr_datasets_count_label, asr_datasets_table, tts_datasets_count_label, tts_datasets_table, log_textbox],
1414
  )
1415
 
1416
  if __name__ == "__main__":
app_content.md CHANGED
@@ -31,12 +31,25 @@ Commercial service support is automatically pulled from the language support pag
31
  - [ASR Datasets](https://huggingface.co/datasets?task_categories=task_categories:automatic-speech-recognition)
32
  - [TTS Datasets](https://huggingface.co/datasets?task_categories=task_categories:text-to-speech)
33
 
 
 
 
 
 
 
 
 
 
 
 
 
 
34
  ## Disclaimer
35
 
36
- - Currently lists only 487 languages and is taken from this [Github repository](https://github.com/datasets/language-codes).
37
  - Data fetched in real-time and can change.
38
- - This is not an exhaustive list. There are other commercial voice technology providers and dataset/model resources that this app doesn't cover.
39
- - Deduplication discards models with same name uploaded by others and keeps the most downloaded version in the list.
40
 
41
  ## Feedback
42
 
 
31
  - [ASR Datasets](https://huggingface.co/datasets?task_categories=task_categories:automatic-speech-recognition)
32
  - [TTS Datasets](https://huggingface.co/datasets?task_categories=task_categories:text-to-speech)
33
 
34
+ ## Language Resource Classification
35
+
36
+ The resource classification shown for each language is based on [Joshi et al.'s 2020 research on linguistic diversity in NLP](https://microsoft.github.io/linguisticdiversity/). This study categorized languages into 6 levels based on their representation in language technology resources:
37
+
38
+ - **Level 5: The Winners** - Languages with the most resources
39
+ - **Level 4: The Underdogs** - Languages with moderate resources
40
+ - **Level 3: The Rising Stars** - Languages with growing resources
41
+ - **Level 2: The Hopefuls** - Languages with limited resources
42
+ - **Level 1: The Scraping-Bys** - Languages with very few resources
43
+ - **Level 0: The Left-Behinds** - Languages with almost no resources
44
+
45
+ **Note:** This classification is from 2020 research and may not reflect the current state of resources for all languages. The landscape of speech technology is rapidly evolving, and some languages have likely gained more resources since this study was conducted.
46
+
47
  ## Disclaimer
48
 
49
+ - The language list only contains 487 languages and is taken from this [Github repository](https://github.com/datasets/language-codes).
50
- Data is fetched in real-time and can change.
51
+ - This is not an exhaustive list of speech and language technology resources. There are other commercial voice technology providers and dataset/model resources that this app doesn't cover.
52
+ - Model deduplication discards models with the same name uploaded by other users and keeps only the most downloaded version in the list.
53
 
54
  ## Feedback
55