openhands / openhands
Commit 376500e · committed by openhands
1 Parent(s): 855423e
Rename columns: Agent→OpenHands Version, Models Used→Language Model, remove Submitter
- Updated simple_data_loader.py to use new column names
- Updated leaderboard_transformer.py with fixed_mappings and scatter plot references
- Updated ui_components.py tooltips and column handling
- Changed metadata.json: agent_name→agent_version across all mock data
- Added 'Bug Fixing' tag to swe-bench benchmarks in scores.json
- Removed 'Submitter' column from base_cols and column references
Co-authored-by: openhands <[email protected]>
- leaderboard_transformer.py +10 -9
- mock_results/1.0.0-dev1/results/20251124_claude_3_5_sonnet_20241022/metadata.json +1 -1
- mock_results/1.0.0-dev1/results/20251124_claude_3_5_sonnet_20241022/scores.json +4 -2
- mock_results/1.0.0-dev1/results/20251124_claude_3_opus_20240229/metadata.json +1 -1
- mock_results/1.0.0-dev1/results/20251124_claude_3_opus_20240229/scores.json +4 -2
- mock_results/1.0.0-dev1/results/20251124_gpt_4_turbo_2024_04_09/metadata.json +1 -1
- mock_results/1.0.0-dev1/results/20251124_gpt_4_turbo_2024_04_09/scores.json +4 -2
- mock_results/1.0.0-dev1/results/20251124_gpt_4o_2024_11_20/metadata.json +1 -1
- mock_results/1.0.0-dev1/results/20251124_gpt_4o_2024_11_20/scores.json +4 -2
- mock_results/1.0.0-dev1/results/20251124_gpt_4o_mini_2024_07_18/metadata.json +1 -1
- mock_results/1.0.0-dev1/results/20251124_gpt_4o_mini_2024_07_18/scores.json +4 -2
- simple_data_loader.py +6 -6
- ui_components.py +23 -27
leaderboard_transformer.py
CHANGED

@@ -103,7 +103,8 @@ def _pretty_column_name(raw_col: str) -> str:
     # Case 1: Handle fixed, special-case mappings first.
     fixed_mappings = {
         'id': 'id',
-        '…
+        'Openhands version': 'OpenHands Version',
+        'Language model': 'Language Model',
         'Agent description': 'Agent Description',
         'Submission date': 'Date',
         'Overall': 'Overall Score',

@@ -255,7 +256,7 @@ class DataTransformer:
         df_view = df_sorted.copy()

         # --- 3. Add Columns for Agent Openness ---
-        base_cols = ["id","…
+        base_cols = ["id","OpenHands Version","Language Model","Source"]
         new_cols = ["Openness"]
         ending_cols = ["Date", "Logs"]

@@ -304,7 +305,7 @@ class DataTransformer:
             data=df_view,
             x=primary_cost_col,
             y=primary_score_col,
-            agent_col="…
+            agent_col="OpenHands Version",
             name=primary_metric
         )
         # Use a consistent key for easy retrieval later

@@ -347,7 +348,7 @@ def _plot_scatter_plotly(

     x_col_to_use = x
     y_col_to_use = y
-    llm_base = data["…
+    llm_base = data["Language Model"] if "Language Model" in data.columns else "Language Model"

     # --- Section 2: Data Preparation---
     required_cols = [y_col_to_use, agent_col, "Openness"]

@@ -432,7 +433,7 @@ def _plot_scatter_plotly(
 def format_hover_text(row, agent_col, x_axis_label, x_col, y_col, divider_line_x):
     """
     Builds the complete HTML string for the plot's hover tooltip.
-    Formats the '…
+    Formats the 'Language Model' column as a bulleted list if multiple.
     """
     h_pad = " "
     parts = ["<br>"]

@@ -447,18 +448,18 @@ def _plot_scatter_plotly(

     # Add extra vertical space (line spacing) before the next section
     parts.append("<br>")
-    # Clean and format…
-    llm_base_value = row['…
+    # Clean and format Language Model column
+    llm_base_value = row['Language Model']
     llm_base_value = clean_llm_base_list(llm_base_value)
     if isinstance(llm_base_value, list) and llm_base_value:
-        parts.append(f"{h_pad}…
+        parts.append(f"{h_pad}Language Model:{h_pad}<br>")
         # Create a list of padded bullet points
         list_items = [f"{h_pad} • <b>{item}</b>{h_pad}" for item in llm_base_value]
         # Join them with line breaks
         parts.append('<br>'.join(list_items))
     else:
         # Handle the non-list case with padding
-        parts.append(f"{h_pad}…
+        parts.append(f"{h_pad}Language Model: <b>{llm_base_value}</b>{h_pad}")
     # Add a final line break for bottom "padding"
     parts.append("<br>")
     # Join all the parts together into the final HTML string
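The fixed_mappings change means the raw loader columns ('Openhands version', 'Language model') are special-cased before any generic prettifying. A minimal sketch of that lookup-then-fallback pattern follows; the mapping entries mirror the diff, while the title-case fallback is an assumption about the rest of _pretty_column_name, which this commit does not show.

# Sketch of the lookup-then-fallback pattern behind _pretty_column_name.
# The mapping entries come from the diff; the title-case fallback is assumed.
FIXED_MAPPINGS = {
    'id': 'id',
    'Openhands version': 'OpenHands Version',
    'Language model': 'Language Model',
    'Agent description': 'Agent Description',
    'Submission date': 'Date',
    'Overall': 'Overall Score',
}

def pretty_column_name(raw_col: str) -> str:
    """Return the display name for a raw column, falling back to title case."""
    if raw_col in FIXED_MAPPINGS:
        return FIXED_MAPPINGS[raw_col]
    return raw_col.replace('_', ' ').title()

print(pretty_column_name('Openhands version'))  # OpenHands Version
print(pretty_column_name('total_cost'))         # Total Cost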
mock_results/1.0.0-dev1/results/20251124_claude_3_5_sonnet_20241022/metadata.json
CHANGED

@@ -1,5 +1,5 @@
 {
-  "…
+  "agent_version": "OpenHands CodeAct v2.1",
   "agent_version": "OpenHands CodeAct v2.1",
   "model": "claude-3-5-sonnet-20241022",
   "openness": "closed_api_available",
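Since every mock metadata.json is supposed to carry agent_version after this commit, a small sanity check over the results tree can confirm the rename landed everywhere. A hedged sketch follows; the glob pattern assumes the mock_results layout listed above, and the check itself is not part of the commit.

# Hedged sanity check: every mock metadata.json should expose "agent_version"
# and no longer carry the old "agent_name" key. Not part of the commit itself.
import json
from pathlib import Path

def check_metadata(results_root: str = "mock_results/1.0.0-dev1/results") -> None:
    for path in sorted(Path(results_root).glob("*/metadata.json")):
        meta = json.loads(path.read_text())
        assert "agent_version" in meta, f"{path}: missing agent_version"
        assert "agent_name" not in meta, f"{path}: stale agent_name key"
        print(f"{path}: agent_version = {meta['agent_version']}")

if __name__ == "__main__":
    check_metadata()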
mock_results/1.0.0-dev1/results/20251124_claude_3_5_sonnet_20241022/scores.json
CHANGED

@@ -6,7 +6,8 @@
       "total_cost": 34.15,
       "total_runtime": 541.5,
       "tags": [
-        "swe-bench"
+        "swe-bench",
+        "Bug Fixing"
       ]
     },
     {

@@ -16,7 +17,8 @@
       "total_cost": 31.05,
       "total_runtime": 510.5,
       "tags": [
-        "swe-bench-multimodal"
+        "swe-bench-multimodal",
+        "Bug Fixing"
       ]
     },
     {
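With 'Bug Fixing' now sitting alongside the benchmark tag, score entries can be selected by category tag as well as by benchmark name. The sketch below shows that selection on the entry shape visible in this diff; the full scores.json structure around these entries is an assumption.

# Sketch of selecting score entries by the new category tag.
# The list-of-dicts shape mirrors the entries visible in the diff;
# the surrounding scores.json structure is assumed.
from typing import Iterable, List

def entries_with_tag(entries: Iterable[dict], tag: str) -> List[dict]:
    """Return score entries whose "tags" list contains the given tag."""
    return [entry for entry in entries if tag in entry.get("tags", [])]

sample = [
    {"total_cost": 34.15, "total_runtime": 541.5,
     "tags": ["swe-bench", "Bug Fixing"]},
    {"total_cost": 31.05, "total_runtime": 510.5,
     "tags": ["swe-bench-multimodal", "Bug Fixing"]},
]
print(len(entries_with_tag(sample, "Bug Fixing")))  # 2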
mock_results/1.0.0-dev1/results/20251124_claude_3_opus_20240229/metadata.json
CHANGED

@@ -1,5 +1,5 @@
 {
-  "…
+  "agent_version": "SWE-Agent",
   "agent_version": "SWE-Agent",
   "model": "claude-3-opus-20240229",
   "openness": "closed_api_available",
mock_results/1.0.0-dev1/results/20251124_claude_3_opus_20240229/scores.json
CHANGED

@@ -6,7 +6,8 @@
       "total_cost": 24.9,
       "total_runtime": 449.0,
       "tags": [
-        "swe-bench"
+        "swe-bench",
+        "Bug Fixing"
       ]
     },
     {

@@ -16,7 +17,8 @@
       "total_cost": 22.85,
       "total_runtime": 428.5,
       "tags": [
-        "swe-bench-multimodal"
+        "swe-bench-multimodal",
+        "Bug Fixing"
       ]
     },
     {
mock_results/1.0.0-dev1/results/20251124_gpt_4_turbo_2024_04_09/metadata.json
CHANGED

@@ -1,5 +1,5 @@
 {
-  "…
+  "agent_version": "AutoCodeRover",
   "agent_version": "AutoCodeRover",
   "model": "gpt-4-turbo-2024-04-09",
   "openness": "closed_api_available",
mock_results/1.0.0-dev1/results/20251124_gpt_4_turbo_2024_04_09/scores.json
CHANGED

@@ -6,7 +6,8 @@
       "total_cost": 29.35,
       "total_runtime": 493.5,
       "tags": [
-        "swe-bench"
+        "swe-bench",
+        "Bug Fixing"
       ]
     },
     {

@@ -16,7 +17,8 @@
       "total_cost": 27.1,
       "total_runtime": 471.0,
       "tags": [
-        "swe-bench-multimodal"
+        "swe-bench-multimodal",
+        "Bug Fixing"
       ]
     },
     {
mock_results/1.0.0-dev1/results/20251124_gpt_4o_2024_11_20/metadata.json
CHANGED

@@ -1,5 +1,5 @@
 {
-  "…
+  "agent_version": "OpenHands CodeAct v2.0",
   "agent_version": "OpenHands CodeAct v2.0",
   "model": "gpt-4o-2024-11-20",
   "openness": "closed_api_available",
mock_results/1.0.0-dev1/results/20251124_gpt_4o_2024_11_20/scores.json
CHANGED

@@ -6,7 +6,8 @@
       "total_cost": 32.55,
       "total_runtime": 525.5,
       "tags": [
-        "swe-bench"
+        "swe-bench",
+        "Bug Fixing"
       ]
     },
     {

@@ -16,7 +17,8 @@
       "total_cost": 29.75,
       "total_runtime": 497.5,
       "tags": [
-        "swe-bench-multimodal"
+        "swe-bench-multimodal",
+        "Bug Fixing"
       ]
     },
     {
mock_results/1.0.0-dev1/results/20251124_gpt_4o_mini_2024_07_18/metadata.json
CHANGED

@@ -1,5 +1,5 @@
 {
-  "…
+  "agent_version": "Agentless",
   "agent_version": "Agentless",
   "model": "gpt-4o-mini-2024-07-18",
   "openness": "closed_api_available",
mock_results/1.0.0-dev1/results/20251124_gpt_4o_mini_2024_07_18/scores.json
CHANGED

@@ -6,7 +6,8 @@
       "total_cost": 26.25,
       "total_runtime": 462.5,
       "tags": [
-        "swe-bench"
+        "swe-bench",
+        "Bug Fixing"
       ]
     },
     {

@@ -16,7 +17,8 @@
       "total_cost": 24.45,
       "total_runtime": 444.5,
       "tags": [
-        "swe-bench-multimodal"
+        "swe-bench-multimodal",
+        "Bug Fixing"
       ]
     },
     {
simple_data_loader.py
CHANGED

@@ -85,7 +85,7 @@ class SimpleLeaderboardViewer:
         # Create one record per benchmark (mimicking old JSONL format)
         for score_entry in scores:
             record = {
-                '…
+                'agent_version': metadata.get('agent_version', 'Unknown'),
                 'llm_base': metadata.get('model', 'unknown'),
                 'openness': metadata.get('openness', 'unknown'),
                 'submission_time': metadata.get('submission_time', ''),

@@ -119,8 +119,8 @@ class SimpleLeaderboardViewer:
         # Group by agent to aggregate results across datasets
         transformed_records = []

-        for…
-        agent_records = df[df['…
+        for agent_version in df['agent_version'].unique():
+            agent_records = df[df['agent_version'] == agent_version]

             # Build a single record for this agent
             first_record = agent_records.iloc[0]

@@ -132,12 +132,12 @@ class SimpleLeaderboardViewer:

             record = {
                 # Core agent info - use final display names
-                '…
-                '…
+                'openhands version': agent_version, # Will become "OpenHands Version"
+                'language model': first_record['llm_base'], # Will become "Language Model"
                 'openness': normalized_openness, # Will become "Openness" (simplified to "open" or "closed")
                 'date': first_record['submission_time'], # Will become "Date"
                 # Additional columns expected by the transformer
-                'id': first_record.get('id',…
+                'id': first_record.get('id', agent_version), # Will become "Id"
                 'source': first_record.get('source', ''), # Will become "Source"
                 'logs': first_record.get('logs', ''), # Will become "Logs"
             }
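The loader now groups the per-benchmark records by agent_version before building one display row per agent. A hedged pandas sketch of that grouping step follows; the column names mirror the diff, while the sample values are illustrative rather than taken from the mock data.

# Sketch of the grouping step in simple_data_loader.py: many per-benchmark
# records collapsed to one row per agent_version. Sample values are invented.
import pandas as pd

records = [
    {"agent_version": "OpenHands CodeAct v2.1", "llm_base": "claude-3-5-sonnet-20241022",
     "openness": "closed_api_available", "submission_time": "2025-11-24"},
    {"agent_version": "OpenHands CodeAct v2.1", "llm_base": "claude-3-5-sonnet-20241022",
     "openness": "closed_api_available", "submission_time": "2025-11-24"},
]
df = pd.DataFrame(records)

transformed_records = []
for agent_version in df["agent_version"].unique():
    first_record = df[df["agent_version"] == agent_version].iloc[0]
    transformed_records.append({
        "openhands version": agent_version,          # becomes "OpenHands Version"
        "language model": first_record["llm_base"],  # becomes "Language Model"
        "date": first_record["submission_time"],     # becomes "Date"
    })
print(pd.DataFrame(transformed_records))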
ui_components.py
CHANGED

@@ -147,9 +147,8 @@ def build_descriptions_tooltip_content(table) -> str:
     """Generates the inner HTML for the Column Descriptions tooltip card depending on which kind of table."""
     if table == "Overall":
         return """
-        <div class="tooltip-description-item"><b>…
-        <div class="tooltip-description-item"><b>…
-        <div class="tooltip-description-item"><b>Models Used:</b> Model(s) used by the agent. Hover over ⓘ to view all.</div>
+        <div class="tooltip-description-item"><b>OpenHands Version:</b> Version of the OpenHands agent evaluated.</div>
+        <div class="tooltip-description-item"><b>Language Model:</b> Language model(s) used by the agent. Hover over ⓘ to view all.</div>
         <div class="tooltip-description-item"><b>Overall Score:</b> Macro-average of the four category-level average scores. Each category contributes equally.</div>
         <div class="tooltip-description-item"><b>Overall Cost:</b> Macro-average cost per problem across all categories, in USD. Each category contributes equally.</div>
         <div class="tooltip-description-item"><b>Literature Understanding Score:</b> Macro-average score across Literature Understanding benchmarks.</div>

@@ -165,9 +164,8 @@ def build_descriptions_tooltip_content(table) -> str:
         """
     elif table in ["Literature Understanding", "Code & Execution", "Data Analysis", "End-to-End Discovery"]:
         return f"""
-        <div class="tooltip-description-item"><b>…
-        <div class="tooltip-description-item"><b>…
-        <div class="tooltip-description-item"><b>Models Used:</b> Model(s) used by the agent. Hover over ⓘ to view all.</div>
+        <div class="tooltip-description-item"><b>OpenHands Version:</b> Version of the OpenHands agent evaluated.</div>
+        <div class="tooltip-description-item"><b>Language Model:</b> Language model(s) used by the agent. Hover over ⓘ to view all.</div>
         <div class="tooltip-description-item"><b>{table} Score:</b> Macro-average score across {table} benchmarks.</div>
         <div class="tooltip-description-item"><b>{table} Cost:</b> Macro-average cost per problem (USD) across {table} benchmarks.</div>
         <div class="tooltip-description-item"><b>Benchmark Score:</b> Average (mean) score on the benchmark.</div>

@@ -178,9 +176,8 @@ def build_descriptions_tooltip_content(table) -> str:
     else:
         # Fallback for any other table type, e.g., individual benchmarks
         return f"""
-        <div class="tooltip-description-item"><b>…
-        <div class="tooltip-description-item"><b>…
-        <div class="tooltip-description-item"><b>Models Used:</b> Model(s) used by the agent. Hover over ⓘ to view all.</div>
+        <div class="tooltip-description-item"><b>OpenHands Version:</b> Version of the OpenHands agent evaluated.</div>
+        <div class="tooltip-description-item"><b>Language Model:</b> Language model(s) used by the agent. Hover over ⓘ to view all.</div>
         <div class="tooltip-description-item"><b>Benchmark Attempted:</b> Indicates whether the agent attempted this benchmark.</div>
         <div class="tooltip-description-item"><b>{table} Score:</b> Score achieved by the agent on this benchmark.</div>
         <div class="tooltip-description-item"><b>{table} Cost:</b> Cost incurred by the agent to solve this benchmark (in USD).</div>

@@ -392,13 +389,13 @@ def create_leaderboard_display(
         if "Score" in col:
             df_view = format_score_column(df_view, col)
     scatter_plot = plots_dict.get('scatter_plot', go.Figure())
-    #Make pretty and format the…
-    df_view['…
-    df_view['…
-    # append the repro url to the end of the…
+    #Make pretty and format the Language Model column
+    df_view['Language Model'] = df_view['Language Model'].apply(clean_llm_base_list)
+    df_view['Language Model'] = df_view['Language Model'].apply(format_llm_base_with_html)
+    # append the repro url to the end of the OpenHands Version
     if 'Source' in df_view.columns:
-        df_view['…
-            lambda row: f"{row['…
+        df_view['OpenHands Version'] = df_view.apply(
+            lambda row: f"{row['OpenHands Version']} {row['Source']}" if pd.notna(row['Source']) and row['Source'] else row['OpenHands Version'],
             axis=1
         )

@@ -416,7 +413,7 @@ def create_leaderboard_display(
     for col in df_headers:
         if col == "Logs" or "Cost" in col or "Score" in col:
             df_datatypes.append("markdown")
-        elif col in ["…
+        elif col in ["OpenHands Version","Icon","Language Model", "Pareto"]:
             df_datatypes.append("html")
         else:
             df_datatypes.append("str")

@@ -508,7 +505,7 @@ def create_benchmark_details_display(
     benchmark_cost_col = f"{benchmark_name} Cost"

     # Define the columns needed for the detailed table
-    table_cols = ['…
+    table_cols = ['OpenHands Version','Source','Openness', 'Date', benchmark_score_col, benchmark_cost_col,'Logs','id', 'Language Model']

     # Filter to only columns that actually exist in the full dataframe
     existing_table_cols = [col for col in table_cols if col in full_df.columns]

@@ -540,13 +537,13 @@ def create_benchmark_details_display(

     benchmark_table_df['Icon'] = benchmark_table_df.apply(get_openness_icon_html, axis=1)

-    #Make pretty and format the…
-    benchmark_table_df['…
-    benchmark_table_df['…
-    # append the repro url to the end of the…
+    #Make pretty and format the Language Model column
+    benchmark_table_df['Language Model'] = benchmark_table_df['Language Model'].apply(clean_llm_base_list)
+    benchmark_table_df['Language Model'] = benchmark_table_df['Language Model'].apply(format_llm_base_with_html)
+    # append the repro url to the end of the OpenHands Version
     if 'Source' in benchmark_table_df.columns:
-        benchmark_table_df['…
-            lambda row: f"{row['…
+        benchmark_table_df['OpenHands Version'] = benchmark_table_df.apply(
+            lambda row: f"{row['OpenHands Version']} {row['Source']}" if row['Source'] else row['OpenHands Version'],
             axis=1
         )

@@ -573,9 +570,8 @@ def create_benchmark_details_display(
     desired_cols_in_order = [
         'Pareto',
         'Icon',
-        '…
-        '…
-        'Models Used',
+        'OpenHands Version',
+        'Language Model',
         'Attempted Benchmark',
         benchmark_score_col,
         benchmark_cost_col,

@@ -597,7 +593,7 @@ def create_benchmark_details_display(
     for col in df_headers:
         if "Logs" in col or "Cost" in col or "Score" in col:
             df_datatypes.append("markdown")
-        elif col in ["…
+        elif col in ["OpenHands Version", "Icon", "Language Model", "Pareto"]:
             df_datatypes.append("html")
         else:
             df_datatypes.append("str")
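ui_components.py formats the Language Model cells through clean_llm_base_list and format_llm_base_with_html, whose bodies are not part of this diff. The sketch below is only a guess at the kind of normalization those helpers imply (splitting a comma-separated model string into a list and rendering multiple models as HTML); the real implementations may differ.

# Hedged guess at the Language Model cell helpers used above; their real
# implementations are not shown in this commit.
from typing import List, Union

def clean_llm_base_list(value: Union[str, List[str], None]) -> Union[List[str], str]:
    """Normalize a model field to a list of model names when possible."""
    if isinstance(value, list):
        return [str(v).strip() for v in value if str(v).strip()]
    if isinstance(value, str) and "," in value:
        return [part.strip() for part in value.split(",") if part.strip()]
    return value if value is not None else ""

def format_llm_base_with_html(value: Union[List[str], str]) -> str:
    """Render one model as plain text, several models as an HTML list."""
    if isinstance(value, list) and len(value) > 1:
        items = "".join(f"<li>{item}</li>" for item in value)
        return f"<ul>{items}</ul>"
    if isinstance(value, list) and value:
        return value[0]
    return str(value)

print(format_llm_base_with_html(clean_llm_base_list("gpt-4o, claude-3-opus")))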