Spaces:
Running
Simplify leaderboard to open vs closed models only
Browse files
Remove agent tooling distinction from the leaderboard:
- Simplified openness from 4 categories to 2: 'open' and 'closed'
- Removed all 'tool_usage' references from data structures
- Updated UI to remove Agent Tooling column and icons
- Simplified scatter plots to use only 2 colors (open/closed)
- Updated legends to show Model Openness instead of Agent Openness/Tooling
- Regenerated mock data without tool_usage field
- Added openness normalization to handle legacy values
Files modified:
- aliases.py: Simplified openness constants, removed tool usage
- simple_data_loader.py: Added openness normalization, removed tool_usage loading
- leaderboard_transformer.py: Removed Agent Tooling column, simplified color mapping
- ui_components.py: Removed tooling icons/tooltips, simplified to openness only
- generate_mock_jsonl.py: Removed tool_usage from mock data generation
- aliases.py +14 -18
- generate_mock_jsonl.py +5 -11
- leaderboard_transformer.py +13 -28
- mock_results/1.0.0-dev1/commit0.jsonl +5 -5
- mock_results/1.0.0-dev1/gaia.jsonl +5 -5
- mock_results/1.0.0-dev1/multi-swe-bench.jsonl +5 -5
- mock_results/1.0.0-dev1/swe-bench-multimodal.jsonl +5 -5
- mock_results/1.0.0-dev1/swe-bench.jsonl +5 -5
- mock_results/1.0.0-dev1/swt-bench.jsonl +5 -5
- simple_data_loader.py +7 -3
- ui_components.py +26 -183
|
@@ -1,22 +1,18 @@
|
|
| 1 |
-
# Define constants
|
| 2 |
-
|
| 3 |
-
|
| 4 |
-
CANONICAL_OPENNESS_CLOSED_API_AVAILABLE = "closed_api_available"
|
| 5 |
-
CANONICAL_OPENNESS_CLOSED_UI_ONLY = "closed_ui_only"
|
| 6 |
-
CANONICAL_TOOL_USAGE_STANDARD = "standard"
|
| 7 |
-
CANONICAL_TOOL_USAGE_CUSTOM_INTERFACE = "custom_interface"
|
| 8 |
-
CANONICAL_TOOL_USAGE_FULLY_CUSTOM = "fully_custom"
|
| 9 |
|
| 10 |
-
|
| 11 |
-
|
| 12 |
-
|
| 13 |
-
|
| 14 |
-
|
| 15 |
-
|
|
|
|
|
|
|
| 16 |
}
|
| 17 |
|
| 18 |
-
|
| 19 |
-
|
| 20 |
-
|
| 21 |
-
CANONICAL_TOOL_USAGE_FULLY_CUSTOM: {"Fully Custom"}
|
| 22 |
}
|
|
|
|
| 1 |
+
# Define constants for openness categories
|
| 2 |
+
CANONICAL_OPENNESS_OPEN = "open"
|
| 3 |
+
CANONICAL_OPENNESS_CLOSED = "closed"
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 4 |
|
| 5 |
+
# Map old openness values to new simplified values
|
| 6 |
+
OPENNESS_MAPPING = {
|
| 7 |
+
"open_source_open_weights": CANONICAL_OPENNESS_OPEN,
|
| 8 |
+
"open_source_closed_weights": CANONICAL_OPENNESS_OPEN,
|
| 9 |
+
"closed_api_available": CANONICAL_OPENNESS_CLOSED,
|
| 10 |
+
"closed_ui_only": CANONICAL_OPENNESS_CLOSED,
|
| 11 |
+
"open": CANONICAL_OPENNESS_OPEN,
|
| 12 |
+
"closed": CANONICAL_OPENNESS_CLOSED,
|
| 13 |
}
|
| 14 |
|
| 15 |
+
OPENNESS_ALIASES = {
|
| 16 |
+
CANONICAL_OPENNESS_OPEN: {"Open", "Open Source", "Open Source + Open Weights"},
|
| 17 |
+
CANONICAL_OPENNESS_CLOSED: {"Closed", "API Available"}
|
|
|
|
| 18 |
}
|
|
@@ -43,8 +43,7 @@ MOCK_AGENTS = [
|
|
| 43 |
{
|
| 44 |
"agent_name": "1.0.2",
|
| 45 |
"llm_base": "claude-3-5-sonnet-20241022",
|
| 46 |
-
"openness": "
|
| 47 |
-
"tool_usage": "standard",
|
| 48 |
"scores": {
|
| 49 |
"swe-bench": 48.3,
|
| 50 |
"multi-swe-bench": 35.2,
|
|
@@ -57,8 +56,7 @@ MOCK_AGENTS = [
|
|
| 57 |
{
|
| 58 |
"agent_name": "1.0.1",
|
| 59 |
"llm_base": "gpt-4o-2024-11-20",
|
| 60 |
-
"openness": "
|
| 61 |
-
"tool_usage": "standard",
|
| 62 |
"scores": {
|
| 63 |
"swe-bench": 45.1,
|
| 64 |
"multi-swe-bench": 32.8,
|
|
@@ -71,8 +69,7 @@ MOCK_AGENTS = [
|
|
| 71 |
{
|
| 72 |
"agent_name": "1.0.0",
|
| 73 |
"llm_base": "gpt-4-turbo-2024-04-09",
|
| 74 |
-
"openness": "
|
| 75 |
-
"tool_usage": "standard",
|
| 76 |
"scores": {
|
| 77 |
"swe-bench": 38.7,
|
| 78 |
"multi-swe-bench": 28.4,
|
|
@@ -85,8 +82,7 @@ MOCK_AGENTS = [
|
|
| 85 |
{
|
| 86 |
"agent_name": "0.9.5",
|
| 87 |
"llm_base": "gpt-4o-mini-2024-07-18",
|
| 88 |
-
"openness": "
|
| 89 |
-
"tool_usage": "standard",
|
| 90 |
"scores": {
|
| 91 |
"swe-bench": 32.5,
|
| 92 |
"multi-swe-bench": 24.1,
|
|
@@ -99,8 +95,7 @@ MOCK_AGENTS = [
|
|
| 99 |
{
|
| 100 |
"agent_name": "0.9.0",
|
| 101 |
"llm_base": "claude-3-opus-20240229",
|
| 102 |
-
"openness": "
|
| 103 |
-
"tool_usage": "custom_interface",
|
| 104 |
"scores": {
|
| 105 |
"swe-bench": 29.8,
|
| 106 |
"multi-swe-bench": 21.5,
|
|
@@ -148,7 +143,6 @@ def generate_mock_data():
|
|
| 148 |
"agent_name": agent["agent_name"],
|
| 149 |
"llm_base": agent["llm_base"],
|
| 150 |
"openness": agent["openness"],
|
| 151 |
-
"tool_usage": agent["tool_usage"],
|
| 152 |
"score": agent["scores"][benchmark_name],
|
| 153 |
"metric": benchmark_info["metric"],
|
| 154 |
"submission_time": datetime.now().isoformat(),
|
|
|
|
| 43 |
{
|
| 44 |
"agent_name": "1.0.2",
|
| 45 |
"llm_base": "claude-3-5-sonnet-20241022",
|
| 46 |
+
"openness": "closed",
|
|
|
|
| 47 |
"scores": {
|
| 48 |
"swe-bench": 48.3,
|
| 49 |
"multi-swe-bench": 35.2,
|
|
|
|
| 56 |
{
|
| 57 |
"agent_name": "1.0.1",
|
| 58 |
"llm_base": "gpt-4o-2024-11-20",
|
| 59 |
+
"openness": "closed",
|
|
|
|
| 60 |
"scores": {
|
| 61 |
"swe-bench": 45.1,
|
| 62 |
"multi-swe-bench": 32.8,
|
|
|
|
| 69 |
{
|
| 70 |
"agent_name": "1.0.0",
|
| 71 |
"llm_base": "gpt-4-turbo-2024-04-09",
|
| 72 |
+
"openness": "closed",
|
|
|
|
| 73 |
"scores": {
|
| 74 |
"swe-bench": 38.7,
|
| 75 |
"multi-swe-bench": 28.4,
|
|
|
|
| 82 |
{
|
| 83 |
"agent_name": "0.9.5",
|
| 84 |
"llm_base": "gpt-4o-mini-2024-07-18",
|
| 85 |
+
"openness": "closed",
|
|
|
|
| 86 |
"scores": {
|
| 87 |
"swe-bench": 32.5,
|
| 88 |
"multi-swe-bench": 24.1,
|
|
|
|
| 95 |
{
|
| 96 |
"agent_name": "0.9.0",
|
| 97 |
"llm_base": "claude-3-opus-20240229",
|
| 98 |
+
"openness": "closed",
|
|
|
|
| 99 |
"scores": {
|
| 100 |
"swe-bench": 29.8,
|
| 101 |
"multi-swe-bench": 21.5,
|
|
|
|
| 143 |
"agent_name": agent["agent_name"],
|
| 144 |
"llm_base": agent["llm_base"],
|
| 145 |
"openness": agent["openness"],
|
|
|
|
| 146 |
"score": agent["scores"][benchmark_name],
|
| 147 |
"metric": benchmark_info["metric"],
|
| 148 |
"submission_time": datetime.now().isoformat(),
|
|
@@ -110,7 +110,6 @@ def _pretty_column_name(raw_col: str) -> str:
|
|
| 110 |
'Overall cost': 'Overall Cost',
|
| 111 |
'Logs': 'Logs',
|
| 112 |
'Openness': 'Openness',
|
| 113 |
-
'Agent tooling': 'Agent Tooling',
|
| 114 |
'LLM base': 'Model',
|
| 115 |
'Source': 'Source',
|
| 116 |
}
|
|
@@ -255,9 +254,9 @@ class DataTransformer:
|
|
| 255 |
|
| 256 |
df_view = df_sorted.copy()
|
| 257 |
|
| 258 |
-
# --- 3. Add Columns for Agent Openness
|
| 259 |
base_cols = ["id","Agent","Submitter","Models Used","Source"]
|
| 260 |
-
new_cols = ["Openness"
|
| 261 |
ending_cols = ["Date", "Logs"]
|
| 262 |
|
| 263 |
metrics_to_display = [primary_score_col, f"{primary_metric} Cost"]
|
|
@@ -331,13 +330,10 @@ def _plot_scatter_plotly(
|
|
| 331 |
) -> go.Figure:
|
| 332 |
|
| 333 |
# --- Section 1: Define Mappings ---
|
| 334 |
-
#
|
| 335 |
-
# so multiple names might correspond to the same color.
|
| 336 |
color_map = {
|
| 337 |
-
aliases.
|
| 338 |
-
aliases.
|
| 339 |
-
aliases.CANONICAL_OPENNESS_CLOSED_API_AVAILABLE: "yellow",
|
| 340 |
-
aliases.CANONICAL_OPENNESS_CLOSED_UI_ONLY: "white",
|
| 341 |
}
|
| 342 |
for canonical_openness, openness_aliases in aliases.OPENNESS_ALIASES.items():
|
| 343 |
for openness_alias in openness_aliases:
|
|
@@ -346,26 +342,15 @@ def _plot_scatter_plotly(
|
|
| 346 |
colors_for_legend = set(aliases.OPENNESS_ALIASES.keys())
|
| 347 |
category_order = list(color_map.keys())
|
| 348 |
|
| 349 |
-
#
|
| 350 |
-
|
| 351 |
-
shape_map = {
|
| 352 |
-
aliases.CANONICAL_TOOL_USAGE_STANDARD: "star",
|
| 353 |
-
aliases.CANONICAL_TOOL_USAGE_CUSTOM_INTERFACE: "star-diamond",
|
| 354 |
-
aliases.CANONICAL_TOOL_USAGE_FULLY_CUSTOM: "star-triangle-up",
|
| 355 |
-
}
|
| 356 |
-
for canonical_tool_usage, tool_usages_aliases in aliases.TOOL_USAGE_ALIASES.items():
|
| 357 |
-
for tool_usage_alias in tool_usages_aliases:
|
| 358 |
-
shape_map[tool_usage_alias] = shape_map[canonical_tool_usage]
|
| 359 |
-
default_shape = 'square'
|
| 360 |
-
# Only keep one name per shape for the legend.
|
| 361 |
-
shapes_for_legend = set(aliases.TOOL_USAGE_ALIASES.keys())
|
| 362 |
|
| 363 |
x_col_to_use = x
|
| 364 |
y_col_to_use = y
|
| 365 |
llm_base = data["Models Used"] if "Models Used" in data.columns else "Models Used"
|
| 366 |
|
| 367 |
# --- Section 2: Data Preparation---
|
| 368 |
-
required_cols = [y_col_to_use, agent_col, "Openness"
|
| 369 |
if not all(col in data.columns for col in required_cols):
|
| 370 |
logger.error(f"Missing one or more required columns for plotting: {required_cols}")
|
| 371 |
return go.Figure()
|
|
@@ -411,7 +396,7 @@ def _plot_scatter_plotly(
|
|
| 411 |
data_plot[x_col_to_use] = 0
|
| 412 |
|
| 413 |
# Clean data based on all necessary columns
|
| 414 |
-
data_plot.dropna(subset=[y_col_to_use, x_col_to_use, "Openness"
|
| 415 |
|
| 416 |
# --- Section 3: Initialize Figure ---
|
| 417 |
fig = go.Figure()
|
|
@@ -458,8 +443,7 @@ def _plot_scatter_plotly(
|
|
| 458 |
parts.append(f"{h_pad}{x_axis_label}: <b>Missing</b>{h_pad}<br>")
|
| 459 |
else:
|
| 460 |
parts.append(f"{h_pad}{x_axis_label}: <b>${row[x_col]:.2f}</b>{h_pad}<br>")
|
| 461 |
-
parts.append(f"{h_pad}Openness: <b>{row['Openness']}</b>{h_pad}
|
| 462 |
-
parts.append(f"{h_pad}Tooling: <b>{row['Agent Tooling']}</b>{h_pad}")
|
| 463 |
|
| 464 |
# Add extra vertical space (line spacing) before the next section
|
| 465 |
parts.append("<br>")
|
|
@@ -491,7 +475,8 @@ def _plot_scatter_plotly(
|
|
| 491 |
),
|
| 492 |
axis=1
|
| 493 |
)
|
| 494 |
-
|
|
|
|
| 495 |
|
| 496 |
# --- Section 6: Plot Markers by "Openness" Category ---
|
| 497 |
for category in category_order:
|
|
@@ -509,7 +494,7 @@ def _plot_scatter_plotly(
|
|
| 509 |
hoverinfo='text',
|
| 510 |
marker=dict(
|
| 511 |
color=color_map.get(category, 'black'),
|
| 512 |
-
symbol=
|
| 513 |
size=15,
|
| 514 |
opacity=0.8,
|
| 515 |
line=dict(width=1, color='deeppink')
|
|
|
|
| 110 |
'Overall cost': 'Overall Cost',
|
| 111 |
'Logs': 'Logs',
|
| 112 |
'Openness': 'Openness',
|
|
|
|
| 113 |
'LLM base': 'Model',
|
| 114 |
'Source': 'Source',
|
| 115 |
}
|
|
|
|
| 254 |
|
| 255 |
df_view = df_sorted.copy()
|
| 256 |
|
| 257 |
+
# --- 3. Add Columns for Agent Openness ---
|
| 258 |
base_cols = ["id","Agent","Submitter","Models Used","Source"]
|
| 259 |
+
new_cols = ["Openness"]
|
| 260 |
ending_cols = ["Date", "Logs"]
|
| 261 |
|
| 262 |
metrics_to_display = [primary_score_col, f"{primary_metric} Cost"]
|
|
|
|
| 330 |
) -> go.Figure:
|
| 331 |
|
| 332 |
# --- Section 1: Define Mappings ---
|
| 333 |
+
# Map openness to colors (simplified: open vs closed)
|
|
|
|
| 334 |
color_map = {
|
| 335 |
+
aliases.CANONICAL_OPENNESS_OPEN: "deeppink",
|
| 336 |
+
aliases.CANONICAL_OPENNESS_CLOSED: "yellow",
|
|
|
|
|
|
|
| 337 |
}
|
| 338 |
for canonical_openness, openness_aliases in aliases.OPENNESS_ALIASES.items():
|
| 339 |
for openness_alias in openness_aliases:
|
|
|
|
| 342 |
colors_for_legend = set(aliases.OPENNESS_ALIASES.keys())
|
| 343 |
category_order = list(color_map.keys())
|
| 344 |
|
| 345 |
+
# Use consistent marker shape (no tooling distinction)
|
| 346 |
+
default_shape = 'circle'
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 347 |
|
| 348 |
x_col_to_use = x
|
| 349 |
y_col_to_use = y
|
| 350 |
llm_base = data["Models Used"] if "Models Used" in data.columns else "Models Used"
|
| 351 |
|
| 352 |
# --- Section 2: Data Preparation---
|
| 353 |
+
required_cols = [y_col_to_use, agent_col, "Openness"]
|
| 354 |
if not all(col in data.columns for col in required_cols):
|
| 355 |
logger.error(f"Missing one or more required columns for plotting: {required_cols}")
|
| 356 |
return go.Figure()
|
|
|
|
| 396 |
data_plot[x_col_to_use] = 0
|
| 397 |
|
| 398 |
# Clean data based on all necessary columns
|
| 399 |
+
data_plot.dropna(subset=[y_col_to_use, x_col_to_use, "Openness"], inplace=True)
|
| 400 |
|
| 401 |
# --- Section 3: Initialize Figure ---
|
| 402 |
fig = go.Figure()
|
|
|
|
| 443 |
parts.append(f"{h_pad}{x_axis_label}: <b>Missing</b>{h_pad}<br>")
|
| 444 |
else:
|
| 445 |
parts.append(f"{h_pad}{x_axis_label}: <b>${row[x_col]:.2f}</b>{h_pad}<br>")
|
| 446 |
+
parts.append(f"{h_pad}Openness: <b>{row['Openness']}</b>{h_pad}")
|
|
|
|
| 447 |
|
| 448 |
# Add extra vertical space (line spacing) before the next section
|
| 449 |
parts.append("<br>")
|
|
|
|
| 475 |
),
|
| 476 |
axis=1
|
| 477 |
)
|
| 478 |
+
# Use consistent shape for all points (no tooling distinction)
|
| 479 |
+
data_plot['shape_symbol'] = default_shape
|
| 480 |
|
| 481 |
# --- Section 6: Plot Markers by "Openness" Category ---
|
| 482 |
for category in category_order:
|
|
|
|
| 494 |
hoverinfo='text',
|
| 495 |
marker=dict(
|
| 496 |
color=color_map.get(category, 'black'),
|
| 497 |
+
symbol=default_shape,
|
| 498 |
size=15,
|
| 499 |
opacity=0.8,
|
| 500 |
line=dict(width=1, color='deeppink')
|
|
@@ -1,5 +1,5 @@
|
|
| 1 |
-
{"agent_name": "1.0.2", "llm_base": "claude-3-5-sonnet-20241022", "openness": "
|
| 2 |
-
{"agent_name": "1.0.1", "llm_base": "gpt-4o-2024-11-20", "openness": "
|
| 3 |
-
{"agent_name": "1.0.0", "llm_base": "gpt-4-turbo-2024-04-09", "openness": "
|
| 4 |
-
{"agent_name": "0.9.5", "llm_base": "gpt-4o-mini-2024-07-18", "openness": "
|
| 5 |
-
{"agent_name": "0.9.0", "llm_base": "claude-3-opus-20240229", "openness": "
|
|
|
|
| 1 |
+
{"agent_name": "1.0.2", "llm_base": "claude-3-5-sonnet-20241022", "openness": "closed", "score": 71.2, "metric": "test_pass_rate", "submission_time": "2025-11-25T00:12:22.972910", "tags": ["commit0"], "total_cost": 45.6, "total_runtime": 656.0}
|
| 2 |
+
{"agent_name": "1.0.1", "llm_base": "gpt-4o-2024-11-20", "openness": "closed", "score": 68.9, "metric": "test_pass_rate", "submission_time": "2025-11-25T00:12:22.972929", "tags": ["commit0"], "total_cost": 44.45, "total_runtime": 644.5}
|
| 3 |
+
{"agent_name": "1.0.0", "llm_base": "gpt-4-turbo-2024-04-09", "openness": "closed", "score": 61.5, "metric": "test_pass_rate", "submission_time": "2025-11-25T00:12:22.972939", "tags": ["commit0"], "total_cost": 40.75, "total_runtime": 607.5}
|
| 4 |
+
{"agent_name": "0.9.5", "llm_base": "gpt-4o-mini-2024-07-18", "openness": "closed", "score": 55.3, "metric": "test_pass_rate", "submission_time": "2025-11-25T00:12:22.972947", "tags": ["commit0"], "total_cost": 37.65, "total_runtime": 576.5}
|
| 5 |
+
{"agent_name": "0.9.0", "llm_base": "claude-3-opus-20240229", "openness": "closed", "score": 52.1, "metric": "test_pass_rate", "submission_time": "2025-11-25T00:12:22.972954", "tags": ["commit0"], "total_cost": 36.05, "total_runtime": 560.5}
|
|
@@ -1,5 +1,5 @@
|
|
| 1 |
-
{"agent_name": "1.0.2", "llm_base": "claude-3-5-sonnet-20241022", "openness": "
|
| 2 |
-
{"agent_name": "1.0.1", "llm_base": "gpt-4o-2024-11-20", "openness": "
|
| 3 |
-
{"agent_name": "1.0.0", "llm_base": "gpt-4-turbo-2024-04-09", "openness": "
|
| 4 |
-
{"agent_name": "0.9.5", "llm_base": "gpt-4o-mini-2024-07-18", "openness": "
|
| 5 |
-
{"agent_name": "0.9.0", "llm_base": "claude-3-opus-20240229", "openness": "
|
|
|
|
| 1 |
+
{"agent_name": "1.0.2", "llm_base": "claude-3-5-sonnet-20241022", "openness": "closed", "score": 58.7, "metric": "accuracy", "submission_time": "2025-11-25T00:12:22.973093", "tags": ["gaia"], "total_cost": 39.35, "total_runtime": 593.5}
|
| 2 |
+
{"agent_name": "1.0.1", "llm_base": "gpt-4o-2024-11-20", "openness": "closed", "score": 55.2, "metric": "accuracy", "submission_time": "2025-11-25T00:12:22.973111", "tags": ["gaia"], "total_cost": 37.6, "total_runtime": 576.0}
|
| 3 |
+
{"agent_name": "1.0.0", "llm_base": "gpt-4-turbo-2024-04-09", "openness": "closed", "score": 48.3, "metric": "accuracy", "submission_time": "2025-11-25T00:12:22.973121", "tags": ["gaia"], "total_cost": 34.15, "total_runtime": 541.5}
|
| 4 |
+
{"agent_name": "0.9.5", "llm_base": "gpt-4o-mini-2024-07-18", "openness": "closed", "score": 42.1, "metric": "accuracy", "submission_time": "2025-11-25T00:12:22.973129", "tags": ["gaia"], "total_cost": 31.05, "total_runtime": 510.5}
|
| 5 |
+
{"agent_name": "0.9.0", "llm_base": "claude-3-opus-20240229", "openness": "closed", "score": 39.4, "metric": "accuracy", "submission_time": "2025-11-25T00:12:22.973137", "tags": ["gaia"], "total_cost": 29.7, "total_runtime": 497.0}
|
|
@@ -1,5 +1,5 @@
|
|
| 1 |
-
{"agent_name": "1.0.2", "llm_base": "claude-3-5-sonnet-20241022", "openness": "
|
| 2 |
-
{"agent_name": "1.0.1", "llm_base": "gpt-4o-2024-11-20", "openness": "
|
| 3 |
-
{"agent_name": "1.0.0", "llm_base": "gpt-4-turbo-2024-04-09", "openness": "
|
| 4 |
-
{"agent_name": "0.9.5", "llm_base": "gpt-4o-mini-2024-07-18", "openness": "
|
| 5 |
-
{"agent_name": "0.9.0", "llm_base": "claude-3-opus-20240229", "openness": "
|
|
|
|
| 1 |
+
{"agent_name": "1.0.2", "llm_base": "claude-3-5-sonnet-20241022", "openness": "closed", "score": 35.2, "metric": "resolve_rate", "submission_time": "2025-11-25T00:12:22.972368", "tags": ["multi-swe-bench"], "total_cost": 27.6, "total_runtime": 476.0}
|
| 2 |
+
{"agent_name": "1.0.1", "llm_base": "gpt-4o-2024-11-20", "openness": "closed", "score": 32.8, "metric": "resolve_rate", "submission_time": "2025-11-25T00:12:22.972389", "tags": ["multi-swe-bench"], "total_cost": 26.4, "total_runtime": 464.0}
|
| 3 |
+
{"agent_name": "1.0.0", "llm_base": "gpt-4-turbo-2024-04-09", "openness": "closed", "score": 28.4, "metric": "resolve_rate", "submission_time": "2025-11-25T00:12:22.972400", "tags": ["multi-swe-bench"], "total_cost": 24.2, "total_runtime": 442.0}
|
| 4 |
+
{"agent_name": "0.9.5", "llm_base": "gpt-4o-mini-2024-07-18", "openness": "closed", "score": 24.1, "metric": "resolve_rate", "submission_time": "2025-11-25T00:12:22.972408", "tags": ["multi-swe-bench"], "total_cost": 22.05, "total_runtime": 420.5}
|
| 5 |
+
{"agent_name": "0.9.0", "llm_base": "claude-3-opus-20240229", "openness": "closed", "score": 21.5, "metric": "resolve_rate", "submission_time": "2025-11-25T00:12:22.972416", "tags": ["multi-swe-bench"], "total_cost": 20.75, "total_runtime": 407.5}
|
|
@@ -1,5 +1,5 @@
|
|
| 1 |
-
{"agent_name": "1.0.2", "llm_base": "claude-3-5-sonnet-20241022", "openness": "
|
| 2 |
-
{"agent_name": "1.0.1", "llm_base": "gpt-4o-2024-11-20", "openness": "
|
| 3 |
-
{"agent_name": "1.0.0", "llm_base": "gpt-4-turbo-2024-04-09", "openness": "
|
| 4 |
-
{"agent_name": "0.9.5", "llm_base": "gpt-4o-mini-2024-07-18", "openness": "
|
| 5 |
-
{"agent_name": "0.9.0", "llm_base": "claude-3-opus-20240229", "openness": "
|
|
|
|
| 1 |
+
{"agent_name": "1.0.2", "llm_base": "claude-3-5-sonnet-20241022", "openness": "closed", "score": 42.1, "metric": "resolve_rate", "submission_time": "2025-11-25T00:12:22.972550", "tags": ["swe-bench-multimodal"], "total_cost": 31.05, "total_runtime": 510.5}
|
| 2 |
+
{"agent_name": "1.0.1", "llm_base": "gpt-4o-2024-11-20", "openness": "closed", "score": 39.5, "metric": "resolve_rate", "submission_time": "2025-11-25T00:12:22.972567", "tags": ["swe-bench-multimodal"], "total_cost": 29.75, "total_runtime": 497.5}
|
| 3 |
+
{"agent_name": "1.0.0", "llm_base": "gpt-4-turbo-2024-04-09", "openness": "closed", "score": 34.2, "metric": "resolve_rate", "submission_time": "2025-11-25T00:12:22.972577", "tags": ["swe-bench-multimodal"], "total_cost": 27.1, "total_runtime": 471.0}
|
| 4 |
+
{"agent_name": "0.9.5", "llm_base": "gpt-4o-mini-2024-07-18", "openness": "closed", "score": 28.9, "metric": "resolve_rate", "submission_time": "2025-11-25T00:12:22.972585", "tags": ["swe-bench-multimodal"], "total_cost": 24.45, "total_runtime": 444.5}
|
| 5 |
+
{"agent_name": "0.9.0", "llm_base": "claude-3-opus-20240229", "openness": "closed", "score": 25.7, "metric": "resolve_rate", "submission_time": "2025-11-25T00:12:22.972593", "tags": ["swe-bench-multimodal"], "total_cost": 22.85, "total_runtime": 428.5}
|
|
@@ -1,5 +1,5 @@
|
|
| 1 |
-
{"agent_name": "1.0.2", "llm_base": "claude-3-5-sonnet-20241022", "openness": "
|
| 2 |
-
{"agent_name": "1.0.1", "llm_base": "gpt-4o-2024-11-20", "openness": "
|
| 3 |
-
{"agent_name": "1.0.0", "llm_base": "gpt-4-turbo-2024-04-09", "openness": "
|
| 4 |
-
{"agent_name": "0.9.5", "llm_base": "gpt-4o-mini-2024-07-18", "openness": "
|
| 5 |
-
{"agent_name": "0.9.0", "llm_base": "claude-3-opus-20240229", "openness": "
|
|
|
|
| 1 |
+
{"agent_name": "1.0.2", "llm_base": "claude-3-5-sonnet-20241022", "openness": "closed", "score": 48.3, "metric": "resolve_rate", "submission_time": "2025-11-25T00:12:22.972101", "tags": ["swe-bench"], "total_cost": 34.15, "total_runtime": 541.5}
|
| 2 |
+
{"agent_name": "1.0.1", "llm_base": "gpt-4o-2024-11-20", "openness": "closed", "score": 45.1, "metric": "resolve_rate", "submission_time": "2025-11-25T00:12:22.972136", "tags": ["swe-bench"], "total_cost": 32.55, "total_runtime": 525.5}
|
| 3 |
+
{"agent_name": "1.0.0", "llm_base": "gpt-4-turbo-2024-04-09", "openness": "closed", "score": 38.7, "metric": "resolve_rate", "submission_time": "2025-11-25T00:12:22.972167", "tags": ["swe-bench"], "total_cost": 29.35, "total_runtime": 493.5}
|
| 4 |
+
{"agent_name": "0.9.5", "llm_base": "gpt-4o-mini-2024-07-18", "openness": "closed", "score": 32.5, "metric": "resolve_rate", "submission_time": "2025-11-25T00:12:22.972178", "tags": ["swe-bench"], "total_cost": 26.25, "total_runtime": 462.5}
|
| 5 |
+
{"agent_name": "0.9.0", "llm_base": "claude-3-opus-20240229", "openness": "closed", "score": 29.8, "metric": "resolve_rate", "submission_time": "2025-11-25T00:12:22.972186", "tags": ["swe-bench"], "total_cost": 24.9, "total_runtime": 449.0}
|
|
@@ -1,5 +1,5 @@
|
|
| 1 |
-
{"agent_name": "1.0.2", "llm_base": "claude-3-5-sonnet-20241022", "openness": "
|
| 2 |
-
{"agent_name": "1.0.1", "llm_base": "gpt-4o-2024-11-20", "openness": "
|
| 3 |
-
{"agent_name": "1.0.0", "llm_base": "gpt-4-turbo-2024-04-09", "openness": "
|
| 4 |
-
{"agent_name": "0.9.5", "llm_base": "gpt-4o-mini-2024-07-18", "openness": "
|
| 5 |
-
{"agent_name": "0.9.0", "llm_base": "claude-3-opus-20240229", "openness": "
|
|
|
|
| 1 |
+
{"agent_name": "1.0.2", "llm_base": "claude-3-5-sonnet-20241022", "openness": "closed", "score": 65.4, "metric": "success_rate", "submission_time": "2025-11-25T00:12:22.972724", "tags": ["swt-bench"], "total_cost": 42.7, "total_runtime": 627.0}
|
| 2 |
+
{"agent_name": "1.0.1", "llm_base": "gpt-4o-2024-11-20", "openness": "closed", "score": 62.3, "metric": "success_rate", "submission_time": "2025-11-25T00:12:22.972741", "tags": ["swt-bench"], "total_cost": 41.15, "total_runtime": 611.5}
|
| 3 |
+
{"agent_name": "1.0.0", "llm_base": "gpt-4-turbo-2024-04-09", "openness": "closed", "score": 54.1, "metric": "success_rate", "submission_time": "2025-11-25T00:12:22.972750", "tags": ["swt-bench"], "total_cost": 37.05, "total_runtime": 570.5}
|
| 4 |
+
{"agent_name": "0.9.5", "llm_base": "gpt-4o-mini-2024-07-18", "openness": "closed", "score": 47.8, "metric": "success_rate", "submission_time": "2025-11-25T00:12:22.972758", "tags": ["swt-bench"], "total_cost": 33.9, "total_runtime": 539.0}
|
| 5 |
+
{"agent_name": "0.9.0", "llm_base": "claude-3-opus-20240229", "openness": "closed", "score": 44.2, "metric": "success_rate", "submission_time": "2025-11-25T00:12:22.972765", "tags": ["swt-bench"], "total_cost": 32.1, "total_runtime": 521.0}
|
|
@@ -88,7 +88,6 @@ class SimpleLeaderboardViewer:
|
|
| 88 |
'agent_name': metadata.get('agent_name', 'Unknown'),
|
| 89 |
'llm_base': metadata.get('model', 'unknown'),
|
| 90 |
'openness': metadata.get('openness', 'unknown'),
|
| 91 |
-
'tool_usage': metadata.get('tool_usage', 'standard'),
|
| 92 |
'submission_time': metadata.get('submission_time', ''),
|
| 93 |
'score': score_entry.get('score'),
|
| 94 |
'metric': score_entry.get('metric', 'unknown'),
|
|
@@ -152,12 +151,17 @@ class SimpleLeaderboardViewer:
|
|
| 152 |
|
| 153 |
# Build a single record for this agent
|
| 154 |
first_record = agent_records.iloc[0]
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 155 |
record = {
|
| 156 |
# Core agent info - use final display names
|
| 157 |
'agent': agent_name, # Will become "Agent Version" after prettifying
|
| 158 |
'models used': first_record['llm_base'], # Will become "Model"
|
| 159 |
-
'openness':
|
| 160 |
-
'agent tooling': first_record['tool_usage'], # Will become "Agent Tooling"
|
| 161 |
'date': first_record['submission_time'], # Will become "Date"
|
| 162 |
# Additional columns expected by the transformer
|
| 163 |
'id': first_record.get('id', agent_name), # Will become "Id"
|
|
|
|
| 88 |
'agent_name': metadata.get('agent_name', 'Unknown'),
|
| 89 |
'llm_base': metadata.get('model', 'unknown'),
|
| 90 |
'openness': metadata.get('openness', 'unknown'),
|
|
|
|
| 91 |
'submission_time': metadata.get('submission_time', ''),
|
| 92 |
'score': score_entry.get('score'),
|
| 93 |
'metric': score_entry.get('metric', 'unknown'),
|
|
|
|
| 151 |
|
| 152 |
# Build a single record for this agent
|
| 153 |
first_record = agent_records.iloc[0]
|
| 154 |
+
|
| 155 |
+
# Normalize openness to "open" or "closed"
|
| 156 |
+
from aliases import OPENNESS_MAPPING
|
| 157 |
+
raw_openness = first_record['openness']
|
| 158 |
+
normalized_openness = OPENNESS_MAPPING.get(raw_openness, raw_openness)
|
| 159 |
+
|
| 160 |
record = {
|
| 161 |
# Core agent info - use final display names
|
| 162 |
'agent': agent_name, # Will become "Agent Version" after prettifying
|
| 163 |
'models used': first_record['llm_base'], # Will become "Model"
|
| 164 |
+
'openness': normalized_openness, # Will become "Openness" (simplified to "open" or "closed")
|
|
|
|
| 165 |
'date': first_record['submission_time'], # Will become "Date"
|
| 166 |
# Additional columns expected by the transformer
|
| 167 |
'id': first_record.get('id', agent_name), # Will become "Id"
|
|
@@ -36,73 +36,27 @@ from content import (
|
|
| 36 |
|
| 37 |
api = HfApi()
|
| 38 |
os.makedirs(EXTRACTED_DATA_DIR, exist_ok=True)
|
| 39 |
-
#
|
| 40 |
-
|
| 41 |
-
|
| 42 |
-
|
| 43 |
-
|
| 44 |
-
aliases.CANONICAL_TOOL_USAGE_FULLY_CUSTOM: "assets/os-ow-custom.svg",
|
| 45 |
-
},
|
| 46 |
-
aliases.CANONICAL_OPENNESS_OPEN_SOURCE_CLOSED_WEIGHTS: {
|
| 47 |
-
aliases.CANONICAL_TOOL_USAGE_STANDARD: "assets/os-standard.svg",
|
| 48 |
-
aliases.CANONICAL_TOOL_USAGE_CUSTOM_INTERFACE: "assets/os-equivalent.svg",
|
| 49 |
-
aliases.CANONICAL_TOOL_USAGE_FULLY_CUSTOM: "assets/os-custom.svg",
|
| 50 |
-
},
|
| 51 |
-
aliases.CANONICAL_OPENNESS_CLOSED_API_AVAILABLE: {
|
| 52 |
-
aliases.CANONICAL_TOOL_USAGE_STANDARD: "assets/api-standard.svg",
|
| 53 |
-
aliases.CANONICAL_TOOL_USAGE_CUSTOM_INTERFACE: "assets/api-equivalent.svg",
|
| 54 |
-
aliases.CANONICAL_TOOL_USAGE_FULLY_CUSTOM: "assets/api-custom.svg",
|
| 55 |
-
},
|
| 56 |
-
aliases.CANONICAL_OPENNESS_CLOSED_UI_ONLY: {
|
| 57 |
-
aliases.CANONICAL_TOOL_USAGE_STANDARD: "assets/c-standard.svg",
|
| 58 |
-
aliases.CANONICAL_TOOL_USAGE_CUSTOM_INTERFACE: "assets/c-equivalent.svg",
|
| 59 |
-
aliases.CANONICAL_TOOL_USAGE_FULLY_CUSTOM: "assets/c-custom.svg",
|
| 60 |
-
}
|
| 61 |
}
|
| 62 |
|
| 63 |
-
|
| 64 |
-
# it's important to do the tool usage first here, so that when
|
| 65 |
-
# we do openness, the tool usage changes get picked up
|
| 66 |
-
for openness in COMBINED_ICON_MAP:
|
| 67 |
-
for canonical_tool_usage, tool_usage_aliases in aliases.TOOL_USAGE_ALIASES.items():
|
| 68 |
-
for tool_usage_alias in tool_usage_aliases:
|
| 69 |
-
COMBINED_ICON_MAP[openness][tool_usage_alias] = COMBINED_ICON_MAP[openness][canonical_tool_usage]
|
| 70 |
-
|
| 71 |
for canonical_openness, openness_aliases in aliases.OPENNESS_ALIASES.items():
|
| 72 |
for openness_alias in openness_aliases:
|
| 73 |
-
|
| 74 |
|
| 75 |
|
| 76 |
OPENNESS_SVG_MAP = {
|
| 77 |
-
aliases.
|
| 78 |
"path": "assets/ellipse-pink.svg",
|
| 79 |
-
"description": "
|
| 80 |
-
},
|
| 81 |
-
aliases.CANONICAL_OPENNESS_OPEN_SOURCE_CLOSED_WEIGHTS: {
|
| 82 |
-
"path": "assets/ellipse-coral.svg",
|
| 83 |
-
"description": "Code is open but uses closed-weight models"
|
| 84 |
},
|
| 85 |
-
aliases.
|
| 86 |
"path": "assets/ellipse-yellow.svg",
|
| 87 |
-
"description": "
|
| 88 |
-
},
|
| 89 |
-
aliases.CANONICAL_OPENNESS_CLOSED_UI_ONLY: {
|
| 90 |
-
"path": "assets/ellipse-white.svg",
|
| 91 |
-
"description": "No access to code or API; UI access only"
|
| 92 |
-
},
|
| 93 |
-
}
|
| 94 |
-
TOOLING_SVG_MAP = {
|
| 95 |
-
aliases.CANONICAL_TOOL_USAGE_STANDARD: {
|
| 96 |
-
"path": "assets/five-point-star.svg",
|
| 97 |
-
"description": "Uses only tools explicitly provided in state.tools"
|
| 98 |
-
},
|
| 99 |
-
aliases.CANONICAL_TOOL_USAGE_CUSTOM_INTERFACE: {
|
| 100 |
-
"path": "assets/four-point-star.svg",
|
| 101 |
-
"description": "Custom tools for accessing an equivalent underlying environment"
|
| 102 |
-
},
|
| 103 |
-
aliases.CANONICAL_TOOL_USAGE_FULLY_CUSTOM: {
|
| 104 |
-
"path": "assets/three-point-star.svg",
|
| 105 |
-
"description": f"Uses tools beyond constraints of {aliases.CANONICAL_TOOL_USAGE_STANDARD} or {aliases.CANONICAL_TOOL_USAGE_CUSTOM_INTERFACE}"
|
| 106 |
},
|
| 107 |
}
|
| 108 |
|
|
@@ -116,29 +70,6 @@ def get_svg_as_data_uri(path: str) -> str:
|
|
| 116 |
print(f"Warning: SVG file not found at {path}")
|
| 117 |
return ""
|
| 118 |
|
| 119 |
-
# Create a pre-loaded version of our map. This should be run ONCE when the app starts.
|
| 120 |
-
PRELOADED_URI_MAP = {
|
| 121 |
-
openness: {
|
| 122 |
-
tooling: get_svg_as_data_uri(path)
|
| 123 |
-
for tooling, path in tooling_map.items()
|
| 124 |
-
}
|
| 125 |
-
for openness, tooling_map in COMBINED_ICON_MAP.items()
|
| 126 |
-
}
|
| 127 |
-
|
| 128 |
-
def get_combined_icon_html(row, uri_map):
|
| 129 |
-
"""
|
| 130 |
-
Looks up the correct icon URI from the pre-loaded map based on the row's
|
| 131 |
-
'Openness' and 'Agent Tooling' values and returns an HTML <img> tag.
|
| 132 |
-
"""
|
| 133 |
-
openness_val = row['Openness']
|
| 134 |
-
tooling_val = row['Agent Tooling']
|
| 135 |
-
uri = uri_map.get(openness_val, {}).get(tooling_val, "")
|
| 136 |
-
# The tooltip will show the exact combination for clarity.
|
| 137 |
-
tooltip = f"Openness: {openness_val}, Tooling: {tooling_val}"
|
| 138 |
-
|
| 139 |
-
# Return the HTML string that Gradio will render in the DataFrame.
|
| 140 |
-
return f'<img src="{uri}" alt="{tooltip}" title="{tooltip}" style="width:24px; height:24px;">'
|
| 141 |
-
|
| 142 |
def create_svg_html(value, svg_map):
|
| 143 |
"""
|
| 144 |
Generates the absolute simplest HTML for an icon, without any extra text.
|
|
@@ -162,18 +93,12 @@ def create_svg_html(value, svg_map):
|
|
| 162 |
|
| 163 |
def build_openness_tooltip_content() -> str:
|
| 164 |
"""
|
| 165 |
-
Generates the inner HTML for the
|
| 166 |
"""
|
| 167 |
-
descriptions = {
|
| 168 |
-
aliases.CANONICAL_OPENNESS_OPEN_SOURCE_OPEN_WEIGHTS: "Both code and ML models are open",
|
| 169 |
-
aliases.CANONICAL_OPENNESS_OPEN_SOURCE_CLOSED_WEIGHTS: "Code is open but uses an ML model with closed-weights",
|
| 170 |
-
aliases.CANONICAL_OPENNESS_CLOSED_API_AVAILABLE: "No access to code; API access only",
|
| 171 |
-
aliases.CANONICAL_OPENNESS_CLOSED_UI_ONLY: "No access to code or API; UI access only",
|
| 172 |
-
}
|
| 173 |
html_items = []
|
| 174 |
for name, info in OPENNESS_SVG_MAP.items():
|
| 175 |
uri = get_svg_as_data_uri(info["path"])
|
| 176 |
-
desc =
|
| 177 |
|
| 178 |
html_items.append(f"""
|
| 179 |
<div class="tooltip-legend-item">
|
|
@@ -190,8 +115,8 @@ def build_openness_tooltip_content() -> str:
|
|
| 190 |
return f"""<span class="tooltip-icon-legend">
|
| 191 |
ⓘ
|
| 192 |
<span class="tooltip-card">
|
| 193 |
-
<h3>
|
| 194 |
-
<p class="tooltip-description">Indicates
|
| 195 |
<div class="tooltip-items-container">{joined_items}</div>
|
| 196 |
</span>
|
| 197 |
</span>"""
|
|
@@ -215,48 +140,7 @@ def build_pareto_tooltip_content() -> str:
|
|
| 215 |
</div>
|
| 216 |
"""
|
| 217 |
|
| 218 |
-
def build_tooling_tooltip_content() -> str:
|
| 219 |
-
"""Generates the inner HTML for the Agent Tooling tooltip card."""
|
| 220 |
-
descriptions = {
|
| 221 |
-
aliases.CANONICAL_TOOL_USAGE_STANDARD: "Uses only predefined tools from the evaluation environment (as defined in Inspect's state.tools).",
|
| 222 |
-
aliases.CANONICAL_TOOL_USAGE_CUSTOM_INTERFACE: "Custom tools for accessing an equivalent underlying environment:",
|
| 223 |
-
aliases.CANONICAL_TOOL_USAGE_FULLY_CUSTOM: f"Uses tools beyond constraints of {aliases.CANONICAL_TOOL_USAGE_STANDARD} or {aliases.CANONICAL_TOOL_USAGE_CUSTOM_INTERFACE}",
|
| 224 |
-
}
|
| 225 |
-
custom_interface_sub_list = """
|
| 226 |
-
<ul class="tooltip-sub-list">
|
| 227 |
-
<li>Literature tasks: Information access is limited to date-restricted usage of the Asta MCP tools.</li>
|
| 228 |
-
<li>Code tasks: Code execution is limited to an iPython shell in a machine environment initialized with the standard Asta sandbox Dockerfile (or equivalent).</li>
|
| 229 |
-
</ul>
|
| 230 |
-
"""
|
| 231 |
-
html_items = []
|
| 232 |
-
for name, info in TOOLING_SVG_MAP.items():
|
| 233 |
-
uri = get_svg_as_data_uri(info["path"])
|
| 234 |
-
desc = descriptions.get(name, "")
|
| 235 |
-
|
| 236 |
-
# Check if this is the special case that needs a sub-list
|
| 237 |
-
sub_list_html = custom_interface_sub_list if name == aliases.CANONICAL_TOOL_USAGE_CUSTOM_INTERFACE else ""
|
| 238 |
-
|
| 239 |
-
html_items.append(f"""
|
| 240 |
-
<div class="tooltip-legend-item">
|
| 241 |
-
<img src="{uri}" alt="{name}">
|
| 242 |
-
<div>
|
| 243 |
-
<strong>{name}</strong>
|
| 244 |
-
<span>{desc}</span>
|
| 245 |
-
{sub_list_html}
|
| 246 |
-
</div>
|
| 247 |
-
</div>
|
| 248 |
-
""")
|
| 249 |
-
|
| 250 |
-
joined_items = "".join(html_items)
|
| 251 |
|
| 252 |
-
return f"""<span class="tooltip-icon-legend">
|
| 253 |
-
ⓘ
|
| 254 |
-
<span class="tooltip-card">
|
| 255 |
-
<h3>Agent Tooling</h3>
|
| 256 |
-
<p class="tooltip-description">Describes the tool usage and execution environment of the agent during evaluation.</p>
|
| 257 |
-
<div class="tooltip-items-container">{joined_items}</div>
|
| 258 |
-
</span>
|
| 259 |
-
</span>"""
|
| 260 |
|
| 261 |
|
| 262 |
def build_descriptions_tooltip_content(table) -> str:
|
|
@@ -303,9 +187,6 @@ def build_descriptions_tooltip_content(table) -> str:
|
|
| 303 |
<div class="tooltip-description-item"><b>Logs:</b> View evaluation run logs (e.g., outputs, traces).</div>
|
| 304 |
"""
|
| 305 |
|
| 306 |
-
# Dynamically generate the correct HTML for the legend parts
|
| 307 |
-
openness_html = " ".join([create_svg_html(name, OPENNESS_SVG_MAP) for name in OPENNESS_SVG_MAP])
|
| 308 |
-
tooling_html = " ".join([create_svg_html(name, TOOLING_SVG_MAP) for name in TOOLING_SVG_MAP])
|
| 309 |
# Create HTML for the "Openness" legend items for table
|
| 310 |
openness_html_items = []
|
| 311 |
for name, info in OPENNESS_SVG_MAP.items():
|
|
@@ -319,21 +200,8 @@ for name, info in OPENNESS_SVG_MAP.items():
|
|
| 319 |
)
|
| 320 |
openness_html = " ".join(openness_html_items)
|
| 321 |
|
| 322 |
-
# Create HTML for the "Tooling" legend items for table
|
| 323 |
-
tooling_html_items = []
|
| 324 |
-
for name, info in TOOLING_SVG_MAP.items():
|
| 325 |
-
uri = get_svg_as_data_uri(info["path"])
|
| 326 |
-
tooling_html_items.append(
|
| 327 |
-
f'<div style="display: flex; align-items: center; white-space: nowrap;">'
|
| 328 |
-
f'<img src="{uri}" alt="{name}" title="{name}" style="width:16px; height:16px; margin-right: 4px; flex-shrink: 0;">'
|
| 329 |
-
f'<span>{name}</span>'
|
| 330 |
-
f'</div>'
|
| 331 |
-
)
|
| 332 |
-
tooling_html = " ".join(tooling_html_items)
|
| 333 |
-
|
| 334 |
pareto_tooltip_content = build_pareto_tooltip_content()
|
| 335 |
openness_tooltip_content = build_openness_tooltip_content()
|
| 336 |
-
tooling_tooltip_content = build_tooling_tooltip_content()
|
| 337 |
|
| 338 |
def create_legend_markdown(which_table: str) -> str:
|
| 339 |
"""
|
|
@@ -358,16 +226,10 @@ def create_legend_markdown(which_table: str) -> str:
|
|
| 358 |
</div>
|
| 359 |
|
| 360 |
<div> <!-- Container for the Openness section -->
|
| 361 |
-
<b>
|
| 362 |
{openness_tooltip_content}
|
| 363 |
<div class="table-legend-item">{openness_html}</div>
|
| 364 |
</div>
|
| 365 |
-
|
| 366 |
-
<div> <!-- Container for the Tooling section -->
|
| 367 |
-
<b>Agent Tooling</b>
|
| 368 |
-
{tooling_tooltip_content}
|
| 369 |
-
<div class="table-legend-item">{tooling_html}</div>
|
| 370 |
-
</div>
|
| 371 |
|
| 372 |
<div><!-- Container for the Column Descriptions section -->
|
| 373 |
<b>Column Descriptions</b>
|
|
@@ -400,22 +262,6 @@ for name, info in OPENNESS_SVG_MAP.items():
|
|
| 400 |
f'</div>'
|
| 401 |
)
|
| 402 |
|
| 403 |
-
tooling_legend_items = []
|
| 404 |
-
for name, info in TOOLING_SVG_MAP.items():
|
| 405 |
-
uri = get_svg_as_data_uri(info["path"])
|
| 406 |
-
if uri:
|
| 407 |
-
tooling_legend_items.append(
|
| 408 |
-
f'<div class="plot-legend-item">'
|
| 409 |
-
f'<img class="plot-legend-item-svg plot-legend-tooling-svg" src="{uri}" alt="{name}" title="{name}">'
|
| 410 |
-
f'<div class="plot-legend-item-text">'
|
| 411 |
-
f'<div>'
|
| 412 |
-
f'<span>{name}</span>'
|
| 413 |
-
f'</div>'
|
| 414 |
-
f'<span class="description">{info["description"]}</span>'
|
| 415 |
-
f'</div>'
|
| 416 |
-
f'</div>'
|
| 417 |
-
)
|
| 418 |
-
|
| 419 |
plot_legend_html = f"""
|
| 420 |
<div class="plot-legend-container">
|
| 421 |
<div id="plot-legend-logo">
|
|
@@ -430,16 +276,10 @@ plot_legend_html = f"""
|
|
| 430 |
</div>
|
| 431 |
</div>
|
| 432 |
</div>
|
| 433 |
-
<div style="margin-bottom: 16px;">
|
| 434 |
-
<span class="plot-legend-category-heading">Agent Openness</span>
|
| 435 |
-
<div style="margin-top: 8px;">
|
| 436 |
-
{''.join(openness_legend_items)}
|
| 437 |
-
</div>
|
| 438 |
-
</div>
|
| 439 |
<div>
|
| 440 |
-
<span class="plot-legend-category-heading">
|
| 441 |
<div style="margin-top: 8px;">
|
| 442 |
-
{''.join(
|
| 443 |
</div>
|
| 444 |
</div>
|
| 445 |
</div>
|
|
@@ -666,7 +506,7 @@ def create_benchmark_details_display(
|
|
| 666 |
benchmark_cost_col = f"{benchmark_name} Cost"
|
| 667 |
|
| 668 |
# Define the columns needed for the detailed table
|
| 669 |
-
table_cols = ['Agent','Source','Openness',
|
| 670 |
|
| 671 |
# Filter to only columns that actually exist in the full dataframe
|
| 672 |
existing_table_cols = [col for col in table_cols if col in full_df.columns]
|
|
@@ -690,10 +530,13 @@ def create_benchmark_details_display(
|
|
| 690 |
axis=1
|
| 691 |
)
|
| 692 |
|
| 693 |
-
|
| 694 |
-
|
| 695 |
-
|
| 696 |
-
|
|
|
|
|
|
|
|
|
|
| 697 |
|
| 698 |
#Make pretty and format the Models Used column
|
| 699 |
benchmark_table_df['Models Used'] = benchmark_table_df['Models Used'].apply(clean_llm_base_list)
|
|
|
|
| 36 |
|
| 37 |
api = HfApi()
|
| 38 |
os.makedirs(EXTRACTED_DATA_DIR, exist_ok=True)
|
| 39 |
+
# Simplified icon map (no tooling distinction, only openness)
|
| 40 |
+
# Looked up by get_openness_icon_html (in create_benchmark_details_display) to pick the icon for a row's Openness value
|
| 41 |
+
OPENNESS_ICON_MAP = {
|
| 42 |
+
aliases.CANONICAL_OPENNESS_OPEN: "assets/ellipse-pink.svg",
|
| 43 |
+
aliases.CANONICAL_OPENNESS_CLOSED: "assets/ellipse-yellow.svg",
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 44 |
}
|
| 45 |
|
| 46 |
+
# Add aliases
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 47 |
for canonical_openness, openness_aliases in aliases.OPENNESS_ALIASES.items():
|
| 48 |
for openness_alias in openness_aliases:
|
| 49 |
+
OPENNESS_ICON_MAP[openness_alias] = OPENNESS_ICON_MAP[canonical_openness]
|
| 50 |
|
| 51 |
|
| 52 |
OPENNESS_SVG_MAP = {
|
| 53 |
+
aliases.CANONICAL_OPENNESS_OPEN: {
|
| 54 |
"path": "assets/ellipse-pink.svg",
|
| 55 |
+
"description": "Open source model"
|
|
|
|
|
|
|
|
|
|
|
|
|
| 56 |
},
|
| 57 |
+
aliases.CANONICAL_OPENNESS_CLOSED: {
|
| 58 |
"path": "assets/ellipse-yellow.svg",
|
| 59 |
+
"description": "Closed source model"
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 60 |
},
|
| 61 |
}
|
| 62 |
|
|
|
|
| 70 |
print(f"Warning: SVG file not found at {path}")
|
| 71 |
return ""
|
| 72 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 73 |
def create_svg_html(value, svg_map):
|
| 74 |
"""
|
| 75 |
Generates the absolute simplest HTML for an icon, without any extra text.
|
|
|
|
| 93 |
|
| 94 |
def build_openness_tooltip_content() -> str:
|
| 95 |
"""
|
| 96 |
+
Generates the inner HTML for the Model Openness tooltip card.
|
| 97 |
"""
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 98 |
html_items = []
|
| 99 |
for name, info in OPENNESS_SVG_MAP.items():
|
| 100 |
uri = get_svg_as_data_uri(info["path"])
|
| 101 |
+
desc = info["description"]
|
| 102 |
|
| 103 |
html_items.append(f"""
|
| 104 |
<div class="tooltip-legend-item">
|
|
|
|
| 115 |
return f"""<span class="tooltip-icon-legend">
|
| 116 |
ⓘ
|
| 117 |
<span class="tooltip-card">
|
| 118 |
+
<h3>Model Openness</h3>
|
| 119 |
+
<p class="tooltip-description">Indicates whether the language model is open source or closed source.</p>
|
| 120 |
<div class="tooltip-items-container">{joined_items}</div>
|
| 121 |
</span>
|
| 122 |
</span>"""
|
|
|
|
| 140 |
</div>
|
| 141 |
"""
|
| 142 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 143 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 144 |
|
| 145 |
|
| 146 |
def build_descriptions_tooltip_content(table) -> str:
|
|
|
|
| 187 |
<div class="tooltip-description-item"><b>Logs:</b> View evaluation run logs (e.g., outputs, traces).</div>
|
| 188 |
"""
|
| 189 |
|
|
|
|
|
|
|
|
|
|
| 190 |
# Create HTML for the "Openness" legend items for table
|
| 191 |
openness_html_items = []
|
| 192 |
for name, info in OPENNESS_SVG_MAP.items():
|
|
|
|
| 200 |
)
|
| 201 |
openness_html = " ".join(openness_html_items)
|
| 202 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 203 |
pareto_tooltip_content = build_pareto_tooltip_content()
|
| 204 |
openness_tooltip_content = build_openness_tooltip_content()
|
|
|
|
| 205 |
|
| 206 |
def create_legend_markdown(which_table: str) -> str:
|
| 207 |
"""
|
|
|
|
| 226 |
</div>
|
| 227 |
|
| 228 |
<div> <!-- Container for the Openness section -->
|
| 229 |
+
<b>Model Openness</b>
|
| 230 |
{openness_tooltip_content}
|
| 231 |
<div class="table-legend-item">{openness_html}</div>
|
| 232 |
</div>
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 233 |
|
| 234 |
<div><!-- Container for the Column Descriptions section -->
|
| 235 |
<b>Column Descriptions</b>
|
|
|
|
| 262 |
f'</div>'
|
| 263 |
)
|
| 264 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 265 |
plot_legend_html = f"""
|
| 266 |
<div class="plot-legend-container">
|
| 267 |
<div id="plot-legend-logo">
|
|
|
|
| 276 |
</div>
|
| 277 |
</div>
|
| 278 |
</div>
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 279 |
<div>
|
| 280 |
+
<span class="plot-legend-category-heading">Model Openness</span>
|
| 281 |
<div style="margin-top: 8px;">
|
| 282 |
+
{''.join(openness_legend_items)}
|
| 283 |
</div>
|
| 284 |
</div>
|
| 285 |
</div>
|
|
|
|
| 506 |
benchmark_cost_col = f"{benchmark_name} Cost"
|
| 507 |
|
| 508 |
# Define the columns needed for the detailed table
|
| 509 |
+
table_cols = ['Agent','Source','Openness', 'Date', benchmark_score_col, benchmark_cost_col,'Logs','id', 'Models Used']
|
| 510 |
|
| 511 |
# Filter to only columns that actually exist in the full dataframe
|
| 512 |
existing_table_cols = [col for col in table_cols if col in full_df.columns]
|
|
|
|
| 530 |
axis=1
|
| 531 |
)
|
| 532 |
|
| 533 |
+
# Create simple openness icons
|
| 534 |
+
def get_openness_icon_html(row):
|
| 535 |
+
openness_val = row.get('Openness', '')
|
| 536 |
+
uri = get_svg_as_data_uri(OPENNESS_ICON_MAP.get(openness_val, "assets/ellipse-pink.svg"))
|
| 537 |
+
return f'<img src="{uri}" alt="{openness_val}" title="{openness_val}" style="width:24px; height:24px;">'
|
| 538 |
+
|
| 539 |
+
benchmark_table_df['Icon'] = benchmark_table_df.apply(get_openness_icon_html, axis=1)
|
| 540 |
|
| 541 |
#Make pretty and format the Models Used column
|
| 542 |
benchmark_table_df['Models Used'] = benchmark_table_df['Models Used'].apply(clean_llm_base_list)
|