Alp commited on
Commit
20e07f3
·
1 Parent(s): 4af7226
Files changed (3) hide show
  1. __pycache__/app.cpython-312.pyc +0 -0
  2. app.py +141 -27
  3. app_content.md +16 -3
__pycache__/app.cpython-312.pyc ADDED
Binary file (56 kB). View file
 
app.py CHANGED
@@ -10,11 +10,26 @@ import re
10
  # Configuration
11
  LANGUAGE_CODES_FILE = "language-codes-full.csv"
12
  APP_CONTENT_FILE = "app_content.md"
 
13
 
14
  # Language list will be loaded from CSV
15
  # Structure: {alpha3_b: {"name": str, "alpha3_t": str, "alpha2": str}}
16
  LANGUAGES = {}
17
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
18
  # App content will be loaded from markdown file
19
  APP_CONTENT = {
20
  "title": "Speech Resource Finder",
@@ -101,6 +116,74 @@ def load_language_list(csv_path=None):
101
  }
102
  print(f"Using fallback with {len(LANGUAGES)} languages")
103
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
104
  @lru_cache(maxsize=1)
105
  def fetch_azure_asr_languages():
106
  """Scrape Azure Speech-to-Text supported languages"""
@@ -1174,6 +1257,7 @@ def search_language_resources(language_code, deduplicate=False):
1174
  print("Initializing Speech Resource Finder...")
1175
  load_app_content()
1176
  load_language_list()
 
1177
 
1178
  # Create language choices for dropdown (code: name format for easy searching)
1179
  language_choices = [f"{code}: {info['name']}" for code, info in sorted(LANGUAGES.items(), key=lambda x: x[1]['name'])]
@@ -1183,23 +1267,23 @@ with gr.Blocks(title=APP_CONTENT["title"]) as demo:
1183
  gr.Markdown(f"# {APP_CONTENT['title']}")
1184
  gr.Markdown(APP_CONTENT["description"])
1185
 
1186
- with gr.Row():
1187
- language_dropdown = gr.Dropdown(
1188
- choices=language_choices,
1189
- label="Select Language",
1190
- info="Type to search for a language",
1191
- allow_custom_value=False,
1192
- filterable=True,
1193
- )
1194
- search_btn = gr.Button("Search", variant="primary")
 
 
 
 
 
 
 
1195
 
1196
- with gr.Row():
1197
- deduplicate_checkbox = gr.Checkbox(
1198
- label="Deduplicate models",
1199
- value=True,
1200
- info="Keep only the model with most downloads for each base name"
1201
- )
1202
-
1203
  gr.Markdown("## Commercial Services")
1204
  commercial_table = gr.Dataframe(
1205
  headers=["Service", "ASR", "TTS"],
@@ -1209,6 +1293,13 @@ with gr.Blocks(title=APP_CONTENT["title"]) as demo:
1209
 
1210
  gr.Markdown("## HuggingFace Models")
1211
 
 
 
 
 
 
 
 
1212
  # Create tabs for ASR and TTS models with count labels
1213
  with gr.Tabs():
1214
  with gr.Tab(label="ASR Models") as asr_tab:
@@ -1266,9 +1357,38 @@ with gr.Blocks(title=APP_CONTENT["title"]) as demo:
1266
 
1267
  def on_search(language_selection, deduplicate):
1268
  if not language_selection:
1269
- return None, "", None, "", None, "", None, "", None, ""
 
 
 
1270
  # Extract the language code from "code: name" format
1271
  language_code = language_selection.split(":")[0].strip()
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1272
  commercial_df, asr_models_df, tts_models_df, asr_models_count, tts_models_count, asr_datasets_df, tts_datasets_df, asr_datasets_count, tts_datasets_count, logs = search_language_resources(language_code, deduplicate=deduplicate)
1273
 
1274
  # Create count labels
@@ -1277,26 +1397,20 @@ with gr.Blocks(title=APP_CONTENT["title"]) as demo:
1277
  asr_datasets_label = f"**Found {asr_datasets_count} ASR dataset(s)**"
1278
  tts_datasets_label = f"**Found {tts_datasets_count} TTS dataset(s)**"
1279
 
1280
- return commercial_df, asr_models_label, asr_models_df, tts_models_label, tts_models_df, asr_datasets_label, asr_datasets_df, tts_datasets_label, tts_datasets_df, logs
1281
-
1282
- search_btn.click(
1283
- fn=on_search,
1284
- inputs=[language_dropdown, deduplicate_checkbox],
1285
- outputs=[commercial_table, asr_count_label, asr_models_table, tts_count_label, tts_models_table, asr_datasets_count_label, asr_datasets_table, tts_datasets_count_label, tts_datasets_table, log_textbox],
1286
- )
1287
 
1288
- # Also trigger search when language is selected
1289
  language_dropdown.change(
1290
  fn=on_search,
1291
  inputs=[language_dropdown, deduplicate_checkbox],
1292
- outputs=[commercial_table, asr_count_label, asr_models_table, tts_count_label, tts_models_table, asr_datasets_count_label, asr_datasets_table, tts_datasets_count_label, tts_datasets_table, log_textbox],
1293
  )
1294
 
1295
  # Trigger search when deduplicate checkbox is changed
1296
  deduplicate_checkbox.change(
1297
  fn=on_search,
1298
  inputs=[language_dropdown, deduplicate_checkbox],
1299
- outputs=[commercial_table, asr_count_label, asr_models_table, tts_count_label, tts_models_table, asr_datasets_count_label, asr_datasets_table, tts_datasets_count_label, tts_datasets_table, log_textbox],
1300
  )
1301
 
1302
  if __name__ == "__main__":
 
10
  # Configuration
11
  LANGUAGE_CODES_FILE = "language-codes-full.csv"
12
  APP_CONTENT_FILE = "app_content.md"
13
+ LANGUAGE_TAXONOMY_URL = "https://microsoft.github.io/linguisticdiversity/assets/lang2tax.txt"
14
 
15
  # Language list will be loaded from CSV
16
  # Structure: {alpha3_b: {"name": str, "alpha3_t": str, "alpha2": str}}
17
  LANGUAGES = {}
18
 
19
+ # Language taxonomy mapping (from Joshi et al.'s linguistic diversity paper)
20
+ # Structure: {language_name_lowercase: level}
21
+ LANGUAGE_TAXONOMY = {}
22
+
23
+ # Taxonomy level descriptions
24
+ TAXONOMY_LEVELS = {
25
+ 0: "The Left-Behinds",
26
+ 1: "The Scraping-Bys",
27
+ 2: "The Hopefuls",
28
+ 3: "The Rising Stars",
29
+ 4: "The Underdogs",
30
+ 5: "The Winners"
31
+ }
32
+
33
  # App content will be loaded from markdown file
34
  APP_CONTENT = {
35
  "title": "Speech Resource Finder",
 
116
  }
117
  print(f"Using fallback with {len(LANGUAGES)} languages")
118
 
119
def load_language_taxonomy():
    """Load language taxonomy data from Microsoft's linguistic diversity project.

    Fetches LANGUAGE_TAXONOMY_URL and populates the module-level
    LANGUAGE_TAXONOMY dict mapping lowercase language name -> taxonomy level
    (0-5, per Joshi et al.). On a network or HTTP failure the function logs a
    warning and leaves LANGUAGE_TAXONOMY unchanged, so lookups fall back to
    "Unknown".
    """
    global LANGUAGE_TAXONOMY

    try:
        response = requests.get(LANGUAGE_TAXONOMY_URL, timeout=10)
        response.raise_for_status()

        # Each non-empty line is "language_name,level". Split from the RIGHT
        # so names that themselves contain commas (e.g. "Chinese, Mandarin" --
        # a form the lookup code explicitly supports) are not silently dropped
        # by a strict two-field split.
        for line in response.text.strip().split('\n'):
            line = line.strip()
            if not line:
                continue
            parts = line.rsplit(',', 1)
            if len(parts) != 2:
                continue
            lang_name = parts[0].strip().lower()
            try:
                level = int(parts[1].strip())
            except ValueError:
                # Skip a malformed row instead of letting the exception
                # abort the whole load via the outer handler.
                continue
            LANGUAGE_TAXONOMY[lang_name] = level

        print(f"Loaded taxonomy data for {len(LANGUAGE_TAXONOMY)} languages")
    except Exception as e:
        print(f"Warning: Could not load language taxonomy: {e}")
        print("Language classification will show as 'Unknown'")
141
def get_taxonomy_color(level):
    """Return the hex color associated with a taxonomy level.

    Level 0 (least-resourced) maps to red and level 5 (best-resourced) to
    green; any unrecognised level falls back to a neutral gray.
    """
    # Palette ordered by level 0..5, red -> green.
    palette = dict(enumerate((
        "#d32f2f",  # 0: The Left-Behinds
        "#f57c00",  # 1: The Scraping-Bys
        "#fbc02d",  # 2: The Hopefuls
        "#afb42b",  # 3: The Rising Stars
        "#7cb342",  # 4: The Underdogs
        "#388e3c",  # 5: The Winners
    )))
    return palette.get(level, "#757575")  # gray for unknown levels
155
def get_language_taxonomy_info(language_name):
    """Look up the Joshi et al. taxonomy class for a language name.

    Matching is case-insensitive and tried in priority order: the full name,
    then each alternative split on ';' (e.g. "Catalan; Valencian"), then each
    variant split on ',' (e.g. "Chinese, Mandarin").

    Returns:
        (level, description) on a hit, or (None, "Unknown") otherwise.
    """
    if not language_name:
        return None, "Unknown"

    needle = language_name.lower()

    # Build the candidate keys in the same priority order the matching
    # should follow: exact name first, then ';' parts, then ',' parts.
    candidates = [needle]
    if ';' in needle:
        candidates.extend(part.strip() for part in needle.split(';'))
    if ',' in needle:
        candidates.extend(part.strip() for part in needle.split(','))

    for candidate in candidates:
        if candidate in LANGUAGE_TAXONOMY:
            level = LANGUAGE_TAXONOMY[candidate]
            return level, TAXONOMY_LEVELS.get(level, f"Level {level}")

    return None, "Unknown"
187
  @lru_cache(maxsize=1)
188
  def fetch_azure_asr_languages():
189
  """Scrape Azure Speech-to-Text supported languages"""
 
1257
  print("Initializing Speech Resource Finder...")
1258
  load_app_content()
1259
  load_language_list()
1260
+ load_language_taxonomy()
1261
 
1262
  # Create language choices for dropdown (code: name format for easy searching)
1263
  language_choices = [f"{code}: {info['name']}" for code, info in sorted(LANGUAGES.items(), key=lambda x: x[1]['name'])]
 
1267
  gr.Markdown(f"# {APP_CONTENT['title']}")
1268
  gr.Markdown(APP_CONTENT["description"])
1269
 
1270
+ with gr.Row(equal_height=True):
1271
+ with gr.Column(scale=2):
1272
+ language_dropdown = gr.Dropdown(
1273
+ choices=language_choices,
1274
+ label="Select Language",
1275
+ info="Type to search for a language",
1276
+ allow_custom_value=False,
1277
+ filterable=True,
1278
+ )
1279
+ with gr.Column(scale=1):
1280
+ language_metadata = gr.HTML(
1281
+ """<div style='padding: 15px; border: 2px solid #e0e0e0; border-radius: 4px; background-color: #fafafa; height: 100%; display: flex; align-items: center; justify-content: center; box-sizing: border-box;'>
1282
+ <p style='margin: 0; color: #333; font-size: 14px;'>Select a language to see resource classification</p>
1283
+ </div>""",
1284
+ elem_id="language-metadata"
1285
+ )
1286
 
 
 
 
 
 
 
 
1287
  gr.Markdown("## Commercial Services")
1288
  commercial_table = gr.Dataframe(
1289
  headers=["Service", "ASR", "TTS"],
 
1293
 
1294
  gr.Markdown("## HuggingFace Models")
1295
 
1296
+ with gr.Row():
1297
+ deduplicate_checkbox = gr.Checkbox(
1298
+ label="Deduplicate models",
1299
+ value=True,
1300
+ info="Keep only the model with most downloads for each base name"
1301
+ )
1302
+
1303
  # Create tabs for ASR and TTS models with count labels
1304
  with gr.Tabs():
1305
  with gr.Tab(label="ASR Models") as asr_tab:
 
1357
 
1358
  def on_search(language_selection, deduplicate):
1359
  if not language_selection:
1360
+ default_html = """<div style='padding: 15px; border: 2px solid #e0e0e0; border-radius: 4px; background-color: #fafafa; height: 100%; display: flex; align-items: center; justify-content: center; box-sizing: border-box;'>
1361
+ <p style='margin: 0; color: #333; font-size: 14px;'>Select a language to see resource classification</p>
1362
+ </div>"""
1363
+ return default_html, None, "", None, "", None, "", None, "", None, ""
1364
  # Extract the language code from "code: name" format
1365
  language_code = language_selection.split(":")[0].strip()
1366
+
1367
+ # Get language name for taxonomy lookup
1368
+ language_name = LANGUAGES.get(language_code, {}).get("name", "")
1369
+
1370
+ # Get taxonomy classification
1371
+ level, classification = get_language_taxonomy_info(language_name)
1372
+
1373
+ # Create metadata display with color coding
1374
+ if level is not None:
1375
+ color = get_taxonomy_color(level)
1376
+ metadata_html = f"""<div style='padding: 15px; border: 2px solid {color}; border-radius: 4px; background-color: #ffffff; height: 100%; box-sizing: border-box; display: flex; flex-direction: column; justify-content: center;'>
1377
+ <h4 style='margin: 0 0 8px 0; color: #333; font-size: 16px;'>{language_name}</h4>
1378
+ <div style='margin: 8px 0;'>
1379
+ <span style='padding: 6px 12px; background-color: {color}; color: white; border-radius: 4px; font-weight: bold; font-size: 13px;'>{classification}</span>
1380
+ </div>
1381
+ <p style='margin: 8px 0 0 0; font-size: 11px; color: #555;'>Source: <a href='https://microsoft.github.io/linguisticdiversity/' target='_blank' style='color: #1976d2; text-decoration: none;'>Joshi et al.</a></p>
1382
+ </div>"""
1383
+ else:
1384
+ metadata_html = f"""<div style='padding: 15px; border: 2px solid #757575; border-radius: 4px; background-color: #ffffff; height: 100%; box-sizing: border-box; display: flex; flex-direction: column; justify-content: center;'>
1385
+ <h4 style='margin: 0 0 8px 0; color: #333; font-size: 16px;'>{language_name}</h4>
1386
+ <div style='margin: 8px 0;'>
1387
+ <span style='padding: 6px 12px; background-color: #757575; color: white; border-radius: 4px; font-weight: bold; font-size: 13px;'>Unknown</span>
1388
+ </div>
1389
+ <p style='margin: 8px 0 0 0; font-size: 11px; color: #555;'>Source: <a href='https://microsoft.github.io/linguisticdiversity/' target='_blank' style='color: #1976d2; text-decoration: none;'>Joshi et al.</a></p>
1390
+ </div>"""
1391
+
1392
  commercial_df, asr_models_df, tts_models_df, asr_models_count, tts_models_count, asr_datasets_df, tts_datasets_df, asr_datasets_count, tts_datasets_count, logs = search_language_resources(language_code, deduplicate=deduplicate)
1393
 
1394
  # Create count labels
 
1397
  asr_datasets_label = f"**Found {asr_datasets_count} ASR dataset(s)**"
1398
  tts_datasets_label = f"**Found {tts_datasets_count} TTS dataset(s)**"
1399
 
1400
+ return metadata_html, commercial_df, asr_models_label, asr_models_df, tts_models_label, tts_models_df, asr_datasets_label, asr_datasets_df, tts_datasets_label, tts_datasets_df, logs
 
 
 
 
 
 
1401
 
1402
+ # Trigger search when language is selected
1403
  language_dropdown.change(
1404
  fn=on_search,
1405
  inputs=[language_dropdown, deduplicate_checkbox],
1406
+ outputs=[language_metadata, commercial_table, asr_count_label, asr_models_table, tts_count_label, tts_models_table, asr_datasets_count_label, asr_datasets_table, tts_datasets_count_label, tts_datasets_table, log_textbox],
1407
  )
1408
 
1409
  # Trigger search when deduplicate checkbox is changed
1410
  deduplicate_checkbox.change(
1411
  fn=on_search,
1412
  inputs=[language_dropdown, deduplicate_checkbox],
1413
+ outputs=[language_metadata, commercial_table, asr_count_label, asr_models_table, tts_count_label, tts_models_table, asr_datasets_count_label, asr_datasets_table, tts_datasets_count_label, tts_datasets_table, log_textbox],
1414
  )
1415
 
1416
  if __name__ == "__main__":
app_content.md CHANGED
@@ -31,12 +31,25 @@ Commercial service support is automatically pulled from the language support pag
31
  - [ASR Datasets](https://huggingface.co/datasets?task_categories=task_categories:automatic-speech-recognition)
32
  - [TTS Datasets](https://huggingface.co/datasets?task_categories=task_categories:text-to-speech)
33
 
 
 
 
 
 
 
 
 
 
 
 
 
 
34
  ## Disclaimer
35
 
36
- - Currently lists only 487 languages and is taken from this [Github repository](https://github.com/datasets/language-codes).
37
  - Data fetched in real-time and can change.
38
- - This is not an exhaustive list. There are other commercial voice technology providers and dataset/model resources that this app doesn't cover.
39
- - Deduplication discards models with same name uploaded by others and keeps the most downloaded version in the list.
40
 
41
  ## Feedback
42
 
 
31
  - [ASR Datasets](https://huggingface.co/datasets?task_categories=task_categories:automatic-speech-recognition)
32
  - [TTS Datasets](https://huggingface.co/datasets?task_categories=task_categories:text-to-speech)
33
 
34
+ ## Language Resource Classification
35
+
36
+ The resource classification shown for each language is based on [Joshi et al.'s 2020 research on linguistic diversity in NLP](https://microsoft.github.io/linguisticdiversity/). This study categorized languages into 6 levels based on their representation in language technology resources:
37
+
38
+ - **Level 5: The Winners** - Languages with the most resources
39
+ - **Level 4: The Underdogs** - Languages with moderate resources
40
+ - **Level 3: The Rising Stars** - Languages with growing resources
41
+ - **Level 2: The Hopefuls** - Languages with limited resources
42
+ - **Level 1: The Scraping-Bys** - Languages with very few resources
43
+ - **Level 0: The Left-Behinds** - Languages with almost no resources
44
+
45
+ **Note:** This classification is from 2020 research and may not reflect the current state of resources for all languages. The landscape of speech technology is rapidly evolving, and some languages have likely gained more resources since this study was conducted.
46
+
47
  ## Disclaimer
48
 
49
+ - The language list only contains 487 languages and is taken from this [Github repository](https://github.com/datasets/language-codes).
50
- Data is fetched in real-time and can change.
51
+ - This is not an exhaustive list of speech and language technology resources. There are other commercial voice technology providers and dataset/model resources that this app doesn't cover.
52
+ - Model deduplication discards models with the same name uploaded by other users and keeps only the most downloaded version in the list.
53
 
54
  ## Feedback
55