openhands / openhands
Commit 376500e · committed by openhands
1 Parent(s): 855423e
Rename columns: Agent→OpenHands Version, Models Used→Language Model, remove Submitter
- Updated simple_data_loader.py to use new column names
- Updated leaderboard_transformer.py with fixed_mappings and scatter plot references
- Updated ui_components.py tooltips and column handling
- Changed metadata.json: agent_name→agent_version across all mock data
- Added 'Bug Fixing' tag to swe-bench benchmarks in scores.json
- Removed 'Submitter' column from base_cols and column references
Co-authored-by: openhands <[email protected]>
- leaderboard_transformer.py +10 -9
- mock_results/1.0.0-dev1/results/20251124_claude_3_5_sonnet_20241022/metadata.json +1 -1
- mock_results/1.0.0-dev1/results/20251124_claude_3_5_sonnet_20241022/scores.json +4 -2
- mock_results/1.0.0-dev1/results/20251124_claude_3_opus_20240229/metadata.json +1 -1
- mock_results/1.0.0-dev1/results/20251124_claude_3_opus_20240229/scores.json +4 -2
- mock_results/1.0.0-dev1/results/20251124_gpt_4_turbo_2024_04_09/metadata.json +1 -1
- mock_results/1.0.0-dev1/results/20251124_gpt_4_turbo_2024_04_09/scores.json +4 -2
- mock_results/1.0.0-dev1/results/20251124_gpt_4o_2024_11_20/metadata.json +1 -1
- mock_results/1.0.0-dev1/results/20251124_gpt_4o_2024_11_20/scores.json +4 -2
- mock_results/1.0.0-dev1/results/20251124_gpt_4o_mini_2024_07_18/metadata.json +1 -1
- mock_results/1.0.0-dev1/results/20251124_gpt_4o_mini_2024_07_18/scores.json +4 -2
- simple_data_loader.py +6 -6
- ui_components.py +23 -27
leaderboard_transformer.py
CHANGED

@@ -103,7 +103,8 @@ def _pretty_column_name(raw_col: str) -> str:
     # Case 1: Handle fixed, special-case mappings first.
     fixed_mappings = {
         'id': 'id',
-        '…
+        'Openhands version': 'OpenHands Version',
+        'Language model': 'Language Model',
         'Agent description': 'Agent Description',
         'Submission date': 'Date',
         'Overall': 'Overall Score',

@@ -255,7 +256,7 @@ class DataTransformer:
         df_view = df_sorted.copy()

         # --- 3. Add Columns for Agent Openness ---
-        base_cols = ["id","…
+        base_cols = ["id","OpenHands Version","Language Model","Source"]
         new_cols = ["Openness"]
         ending_cols = ["Date", "Logs"]

@@ -304,7 +305,7 @@ class DataTransformer:
             data=df_view,
             x=primary_cost_col,
             y=primary_score_col,
-            agent_col="…
+            agent_col="OpenHands Version",
             name=primary_metric
         )
         # Use a consistent key for easy retrieval later

@@ -347,7 +348,7 @@ def _plot_scatter_plotly(

     x_col_to_use = x
     y_col_to_use = y
-    llm_base = data["…
+    llm_base = data["Language Model"] if "Language Model" in data.columns else "Language Model"

     # --- Section 2: Data Preparation---
     required_cols = [y_col_to_use, agent_col, "Openness"]

@@ -432,7 +433,7 @@ def _plot_scatter_plotly(
 def format_hover_text(row, agent_col, x_axis_label, x_col, y_col, divider_line_x):
     """
     Builds the complete HTML string for the plot's hover tooltip.
-    Formats the '…
+    Formats the 'Language Model' column as a bulleted list if multiple.
     """
     h_pad = " "
     parts = ["<br>"]

@@ -447,18 +448,18 @@ def _plot_scatter_plotly(

     # Add extra vertical space (line spacing) before the next section
     parts.append("<br>")
-    # Clean and format…
-    llm_base_value = row['…
+    # Clean and format Language Model column
+    llm_base_value = row['Language Model']
     llm_base_value = clean_llm_base_list(llm_base_value)
     if isinstance(llm_base_value, list) and llm_base_value:
-        parts.append(f"{h_pad}…
+        parts.append(f"{h_pad}Language Model:{h_pad}<br>")
         # Create a list of padded bullet points
         list_items = [f"{h_pad} • <b>{item}</b>{h_pad}" for item in llm_base_value]
         # Join them with line breaks
         parts.append('<br>'.join(list_items))
     else:
         # Handle the non-list case with padding
-        parts.append(f"{h_pad}…
+        parts.append(f"{h_pad}Language Model: <b>{llm_base_value}</b>{h_pad}")
     # Add a final line break for bottom "padding"
     parts.append("<br>")
     # Join all the parts together into the final HTML string
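The fixed_mappings change means the raw loader columns ('Openhands version', 'Language model') are special-cased before any generic prettifying. A minimal sketch of that lookup-then-fallback pattern follows; the mapping entries mirror the diff, while the title-case fallback is an assumption about the rest of _pretty_column_name, which this commit does not show.

# Sketch of the lookup-then-fallback pattern behind _pretty_column_name.
# The mapping entries come from the diff; the title-case fallback is assumed.
FIXED_MAPPINGS = {
    'id': 'id',
    'Openhands version': 'OpenHands Version',
    'Language model': 'Language Model',
    'Agent description': 'Agent Description',
    'Submission date': 'Date',
    'Overall': 'Overall Score',
}

def pretty_column_name(raw_col: str) -> str:
    """Return the display name for a raw column, falling back to title case."""
    if raw_col in FIXED_MAPPINGS:
        return FIXED_MAPPINGS[raw_col]
    return raw_col.replace('_', ' ').title()

print(pretty_column_name('Openhands version'))  # OpenHands Version
print(pretty_column_name('total_cost'))         # Total Cost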
mock_results/1.0.0-dev1/results/20251124_claude_3_5_sonnet_20241022/metadata.json
CHANGED

@@ -1,5 +1,5 @@
 {
-  "…
+  "agent_version": "OpenHands CodeAct v2.1",
   "agent_version": "OpenHands CodeAct v2.1",
   "model": "claude-3-5-sonnet-20241022",
   "openness": "closed_api_available",
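Since every mock metadata.json is supposed to carry agent_version after this commit, a small sanity check over the results tree can confirm the rename landed everywhere. A hedged sketch follows; the glob pattern assumes the mock_results layout listed above, and the check itself is not part of the commit.

# Hedged sanity check: every mock metadata.json should expose "agent_version"
# and no longer carry the old "agent_name" key. Not part of the commit itself.
import json
from pathlib import Path

def check_metadata(results_root: str = "mock_results/1.0.0-dev1/results") -> None:
    for path in sorted(Path(results_root).glob("*/metadata.json")):
        meta = json.loads(path.read_text())
        assert "agent_version" in meta, f"{path}: missing agent_version"
        assert "agent_name" not in meta, f"{path}: stale agent_name key"
        print(f"{path}: agent_version = {meta['agent_version']}")

if __name__ == "__main__":
    check_metadata()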
mock_results/1.0.0-dev1/results/20251124_claude_3_5_sonnet_20241022/scores.json
CHANGED

@@ -6,7 +6,8 @@
       "total_cost": 34.15,
       "total_runtime": 541.5,
       "tags": [
-        "swe-bench"
+        "swe-bench",
+        "Bug Fixing"
       ]
     },
     {

@@ -16,7 +17,8 @@
       "total_cost": 31.05,
       "total_runtime": 510.5,
       "tags": [
-        "swe-bench-multimodal"
+        "swe-bench-multimodal",
+        "Bug Fixing"
       ]
     },
     {
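With 'Bug Fixing' now sitting alongside the benchmark tag, score entries can be selected by category tag as well as by benchmark name. The sketch below shows that selection on the entry shape visible in this diff; the full scores.json structure around these entries is an assumption.

# Sketch of selecting score entries by the new category tag.
# The list-of-dicts shape mirrors the entries visible in the diff;
# the surrounding scores.json structure is assumed.
from typing import Iterable, List

def entries_with_tag(entries: Iterable[dict], tag: str) -> List[dict]:
    """Return score entries whose "tags" list contains the given tag."""
    return [entry for entry in entries if tag in entry.get("tags", [])]

sample = [
    {"total_cost": 34.15, "total_runtime": 541.5,
     "tags": ["swe-bench", "Bug Fixing"]},
    {"total_cost": 31.05, "total_runtime": 510.5,
     "tags": ["swe-bench-multimodal", "Bug Fixing"]},
]
print(len(entries_with_tag(sample, "Bug Fixing")))  # 2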
mock_results/1.0.0-dev1/results/20251124_claude_3_opus_20240229/metadata.json
CHANGED

@@ -1,5 +1,5 @@
 {
-  "…
+  "agent_version": "SWE-Agent",
   "agent_version": "SWE-Agent",
   "model": "claude-3-opus-20240229",
   "openness": "closed_api_available",
mock_results/1.0.0-dev1/results/20251124_claude_3_opus_20240229/scores.json
CHANGED

@@ -6,7 +6,8 @@
       "total_cost": 24.9,
       "total_runtime": 449.0,
       "tags": [
-        "swe-bench"
+        "swe-bench",
+        "Bug Fixing"
       ]
     },
     {

@@ -16,7 +17,8 @@
       "total_cost": 22.85,
       "total_runtime": 428.5,
       "tags": [
-        "swe-bench-multimodal"
+        "swe-bench-multimodal",
+        "Bug Fixing"
       ]
     },
     {
mock_results/1.0.0-dev1/results/20251124_gpt_4_turbo_2024_04_09/metadata.json
CHANGED

@@ -1,5 +1,5 @@
 {
-  "…
+  "agent_version": "AutoCodeRover",
   "agent_version": "AutoCodeRover",
   "model": "gpt-4-turbo-2024-04-09",
   "openness": "closed_api_available",
mock_results/1.0.0-dev1/results/20251124_gpt_4_turbo_2024_04_09/scores.json
CHANGED

@@ -6,7 +6,8 @@
       "total_cost": 29.35,
       "total_runtime": 493.5,
       "tags": [
-        "swe-bench"
+        "swe-bench",
+        "Bug Fixing"
       ]
     },
     {

@@ -16,7 +17,8 @@
       "total_cost": 27.1,
       "total_runtime": 471.0,
       "tags": [
-        "swe-bench-multimodal"
+        "swe-bench-multimodal",
+        "Bug Fixing"
       ]
     },
     {
mock_results/1.0.0-dev1/results/20251124_gpt_4o_2024_11_20/metadata.json
CHANGED

@@ -1,5 +1,5 @@
 {
-  "…
+  "agent_version": "OpenHands CodeAct v2.0",
   "agent_version": "OpenHands CodeAct v2.0",
   "model": "gpt-4o-2024-11-20",
   "openness": "closed_api_available",
mock_results/1.0.0-dev1/results/20251124_gpt_4o_2024_11_20/scores.json
CHANGED

@@ -6,7 +6,8 @@
       "total_cost": 32.55,
       "total_runtime": 525.5,
       "tags": [
-        "swe-bench"
+        "swe-bench",
+        "Bug Fixing"
       ]
     },
     {

@@ -16,7 +17,8 @@
       "total_cost": 29.75,
       "total_runtime": 497.5,
       "tags": [
-        "swe-bench-multimodal"
+        "swe-bench-multimodal",
+        "Bug Fixing"
       ]
     },
     {
mock_results/1.0.0-dev1/results/20251124_gpt_4o_mini_2024_07_18/metadata.json
CHANGED

@@ -1,5 +1,5 @@
 {
-  "…
+  "agent_version": "Agentless",
   "agent_version": "Agentless",
   "model": "gpt-4o-mini-2024-07-18",
   "openness": "closed_api_available",
mock_results/1.0.0-dev1/results/20251124_gpt_4o_mini_2024_07_18/scores.json
CHANGED

@@ -6,7 +6,8 @@
       "total_cost": 26.25,
       "total_runtime": 462.5,
       "tags": [
-        "swe-bench"
+        "swe-bench",
+        "Bug Fixing"
       ]
     },
     {

@@ -16,7 +17,8 @@
       "total_cost": 24.45,
       "total_runtime": 444.5,
       "tags": [
-        "swe-bench-multimodal"
+        "swe-bench-multimodal",
+        "Bug Fixing"
       ]
     },
     {
simple_data_loader.py
CHANGED

@@ -85,7 +85,7 @@ class SimpleLeaderboardViewer:
         # Create one record per benchmark (mimicking old JSONL format)
         for score_entry in scores:
             record = {
-                '…
+                'agent_version': metadata.get('agent_version', 'Unknown'),
                 'llm_base': metadata.get('model', 'unknown'),
                 'openness': metadata.get('openness', 'unknown'),
                 'submission_time': metadata.get('submission_time', ''),

@@ -119,8 +119,8 @@ class SimpleLeaderboardViewer:
         # Group by agent to aggregate results across datasets
         transformed_records = []

-        for…
-        agent_records = df[df['…
+        for agent_version in df['agent_version'].unique():
+            agent_records = df[df['agent_version'] == agent_version]

             # Build a single record for this agent
             first_record = agent_records.iloc[0]

@@ -132,12 +132,12 @@ class SimpleLeaderboardViewer:

             record = {
                 # Core agent info - use final display names
-                '…
-                '…
+                'openhands version': agent_version, # Will become "OpenHands Version"
+                'language model': first_record['llm_base'], # Will become "Language Model"
                 'openness': normalized_openness, # Will become "Openness" (simplified to "open" or "closed")
                 'date': first_record['submission_time'], # Will become "Date"
                 # Additional columns expected by the transformer
-                'id': first_record.get('id',…
+                'id': first_record.get('id', agent_version), # Will become "Id"
                 'source': first_record.get('source', ''), # Will become "Source"
                 'logs': first_record.get('logs', ''), # Will become "Logs"
             }
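The loader now groups the per-benchmark records by agent_version before building one display row per agent. A hedged pandas sketch of that grouping step follows; the column names mirror the diff, while the sample values are illustrative rather than taken from the mock data.

# Sketch of the grouping step in simple_data_loader.py: many per-benchmark
# records collapsed to one row per agent_version. Sample values are invented.
import pandas as pd

records = [
    {"agent_version": "OpenHands CodeAct v2.1", "llm_base": "claude-3-5-sonnet-20241022",
     "openness": "closed_api_available", "submission_time": "2025-11-24"},
    {"agent_version": "OpenHands CodeAct v2.1", "llm_base": "claude-3-5-sonnet-20241022",
     "openness": "closed_api_available", "submission_time": "2025-11-24"},
]
df = pd.DataFrame(records)

transformed_records = []
for agent_version in df["agent_version"].unique():
    first_record = df[df["agent_version"] == agent_version].iloc[0]
    transformed_records.append({
        "openhands version": agent_version,          # becomes "OpenHands Version"
        "language model": first_record["llm_base"],  # becomes "Language Model"
        "date": first_record["submission_time"],     # becomes "Date"
    })
print(pd.DataFrame(transformed_records))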
ui_components.py
CHANGED

@@ -147,9 +147,8 @@ def build_descriptions_tooltip_content(table) -> str:
     """Generates the inner HTML for the Column Descriptions tooltip card depending on which kind of table."""
     if table == "Overall":
         return """
-        <div class="tooltip-description-item"><b>…
-        <div class="tooltip-description-item"><b>…
-        <div class="tooltip-description-item"><b>Models Used:</b> Model(s) used by the agent. Hover over ⓘ to view all.</div>
+        <div class="tooltip-description-item"><b>OpenHands Version:</b> Version of the OpenHands agent evaluated.</div>
+        <div class="tooltip-description-item"><b>Language Model:</b> Language model(s) used by the agent. Hover over ⓘ to view all.</div>
         <div class="tooltip-description-item"><b>Overall Score:</b> Macro-average of the four category-level average scores. Each category contributes equally.</div>
         <div class="tooltip-description-item"><b>Overall Cost:</b> Macro-average cost per problem across all categories, in USD. Each category contributes equally.</div>
         <div class="tooltip-description-item"><b>Literature Understanding Score:</b> Macro-average score across Literature Understanding benchmarks.</div>

@@ -165,9 +164,8 @@ def build_descriptions_tooltip_content(table) -> str:
         """
     elif table in ["Literature Understanding", "Code & Execution", "Data Analysis", "End-to-End Discovery"]:
         return f"""
-        <div class="tooltip-description-item"><b>…
-        <div class="tooltip-description-item"><b>…
-        <div class="tooltip-description-item"><b>Models Used:</b> Model(s) used by the agent. Hover over ⓘ to view all.</div>
+        <div class="tooltip-description-item"><b>OpenHands Version:</b> Version of the OpenHands agent evaluated.</div>
+        <div class="tooltip-description-item"><b>Language Model:</b> Language model(s) used by the agent. Hover over ⓘ to view all.</div>
         <div class="tooltip-description-item"><b>{table} Score:</b> Macro-average score across {table} benchmarks.</div>
         <div class="tooltip-description-item"><b>{table} Cost:</b> Macro-average cost per problem (USD) across {table} benchmarks.</div>
         <div class="tooltip-description-item"><b>Benchmark Score:</b> Average (mean) score on the benchmark.</div>

@@ -178,9 +176,8 @@ def build_descriptions_tooltip_content(table) -> str:
     else:
         # Fallback for any other table type, e.g., individual benchmarks
         return f"""
-        <div class="tooltip-description-item"><b>…
-        <div class="tooltip-description-item"><b>…
-        <div class="tooltip-description-item"><b>Models Used:</b> Model(s) used by the agent. Hover over ⓘ to view all.</div>
+        <div class="tooltip-description-item"><b>OpenHands Version:</b> Version of the OpenHands agent evaluated.</div>
+        <div class="tooltip-description-item"><b>Language Model:</b> Language model(s) used by the agent. Hover over ⓘ to view all.</div>
         <div class="tooltip-description-item"><b>Benchmark Attempted:</b> Indicates whether the agent attempted this benchmark.</div>
         <div class="tooltip-description-item"><b>{table} Score:</b> Score achieved by the agent on this benchmark.</div>
         <div class="tooltip-description-item"><b>{table} Cost:</b> Cost incurred by the agent to solve this benchmark (in USD).</div>

@@ -392,13 +389,13 @@ def create_leaderboard_display(
         if "Score" in col:
             df_view = format_score_column(df_view, col)
     scatter_plot = plots_dict.get('scatter_plot', go.Figure())
-    #Make pretty and format the…
-    df_view['…
-    df_view['…
-    # append the repro url to the end of the…
+    #Make pretty and format the Language Model column
+    df_view['Language Model'] = df_view['Language Model'].apply(clean_llm_base_list)
+    df_view['Language Model'] = df_view['Language Model'].apply(format_llm_base_with_html)
+    # append the repro url to the end of the OpenHands Version
     if 'Source' in df_view.columns:
-        df_view['…
-            lambda row: f"{row['…
+        df_view['OpenHands Version'] = df_view.apply(
+            lambda row: f"{row['OpenHands Version']} {row['Source']}" if pd.notna(row['Source']) and row['Source'] else row['OpenHands Version'],
             axis=1
         )

@@ -416,7 +413,7 @@ def create_leaderboard_display(
     for col in df_headers:
         if col == "Logs" or "Cost" in col or "Score" in col:
             df_datatypes.append("markdown")
-        elif col in ["…
+        elif col in ["OpenHands Version","Icon","Language Model", "Pareto"]:
             df_datatypes.append("html")
         else:
             df_datatypes.append("str")

@@ -508,7 +505,7 @@ def create_benchmark_details_display(
     benchmark_cost_col = f"{benchmark_name} Cost"

     # Define the columns needed for the detailed table
-    table_cols = ['…
+    table_cols = ['OpenHands Version','Source','Openness', 'Date', benchmark_score_col, benchmark_cost_col,'Logs','id', 'Language Model']

     # Filter to only columns that actually exist in the full dataframe
     existing_table_cols = [col for col in table_cols if col in full_df.columns]

@@ -540,13 +537,13 @@ def create_benchmark_details_display(

     benchmark_table_df['Icon'] = benchmark_table_df.apply(get_openness_icon_html, axis=1)

-    #Make pretty and format the…
-    benchmark_table_df['…
-    benchmark_table_df['…
-    # append the repro url to the end of the…
+    #Make pretty and format the Language Model column
+    benchmark_table_df['Language Model'] = benchmark_table_df['Language Model'].apply(clean_llm_base_list)
+    benchmark_table_df['Language Model'] = benchmark_table_df['Language Model'].apply(format_llm_base_with_html)
+    # append the repro url to the end of the OpenHands Version
     if 'Source' in benchmark_table_df.columns:
-        benchmark_table_df['…
-            lambda row: f"{row['…
+        benchmark_table_df['OpenHands Version'] = benchmark_table_df.apply(
+            lambda row: f"{row['OpenHands Version']} {row['Source']}" if row['Source'] else row['OpenHands Version'],
             axis=1
         )

@@ -573,9 +570,8 @@ def create_benchmark_details_display(
     desired_cols_in_order = [
         'Pareto',
         'Icon',
-        '…
-        '…
-        'Models Used',
+        'OpenHands Version',
+        'Language Model',
         'Attempted Benchmark',
         benchmark_score_col,
         benchmark_cost_col,

@@ -597,7 +593,7 @@ def create_benchmark_details_display(
     for col in df_headers:
         if "Logs" in col or "Cost" in col or "Score" in col:
             df_datatypes.append("markdown")
-        elif col in ["…
+        elif col in ["OpenHands Version", "Icon", "Language Model", "Pareto"]:
             df_datatypes.append("html")
         else:
             df_datatypes.append("str")
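ui_components.py formats the Language Model cells through clean_llm_base_list and format_llm_base_with_html, whose bodies are not part of this diff. The sketch below is only a guess at the kind of normalization those helpers imply (splitting a comma-separated model string into a list and rendering multiple models as HTML); the real implementations may differ.

# Hedged guess at the Language Model cell helpers used above; their real
# implementations are not shown in this commit.
from typing import List, Union

def clean_llm_base_list(value: Union[str, List[str], None]) -> Union[List[str], str]:
    """Normalize a model field to a list of model names when possible."""
    if isinstance(value, list):
        return [str(v).strip() for v in value if str(v).strip()]
    if isinstance(value, str) and "," in value:
        return [part.strip() for part in value.split(",") if part.strip()]
    return value if value is not None else ""

def format_llm_base_with_html(value: Union[List[str], str]) -> str:
    """Render one model as plain text, several models as an HTML list."""
    if isinstance(value, list) and len(value) > 1:
        items = "".join(f"<li>{item}</li>" for item in value)
        return f"<ul>{items}</ul>"
    if isinstance(value, list) and value:
        return value[0]
    return str(value)

print(format_llm_base_with_html(clean_llm_base_list("gpt-4o, claude-3-opus")))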