openhands committed
Commit ca754bb · Parent: 701c496

Simplify leaderboard to open vs closed models only

Remove agent tooling distinction from the leaderboard:
- Simplified openness from 4 categories to 2: 'open' and 'closed'
- Removed all 'tool_usage' references from data structures
- Updated UI to remove Agent Tooling column and icons
- Simplified scatter plots to use only 2 colors (open/closed)
- Updated legends to show Model Openness instead of Agent Openness/Tooling
- Regenerated mock data without tool_usage field
- Added openness normalization to handle legacy values

Files modified:
- aliases.py: Simplified openness constants, removed tool usage
- simple_data_loader.py: Added openness normalization, removed tool_usage loading
- leaderboard_transformer.py: Removed Agent Tooling column, simplified color mapping
- ui_components.py: Removed tooling icons/tooltips, simplified to openness only
- generate_mock_jsonl.py: Removed tool_usage from mock data generation

aliases.py CHANGED
@@ -1,22 +1,18 @@
-# Define constants that were previously imported from agenteval
-CANONICAL_OPENNESS_OPEN_SOURCE_OPEN_WEIGHTS = "open_source_open_weights"
-CANONICAL_OPENNESS_OPEN_SOURCE_CLOSED_WEIGHTS = "open_source_closed_weights"
-CANONICAL_OPENNESS_CLOSED_API_AVAILABLE = "closed_api_available"
-CANONICAL_OPENNESS_CLOSED_UI_ONLY = "closed_ui_only"
-CANONICAL_TOOL_USAGE_STANDARD = "standard"
-CANONICAL_TOOL_USAGE_CUSTOM_INTERFACE = "custom_interface"
-CANONICAL_TOOL_USAGE_FULLY_CUSTOM = "fully_custom"
+# Define constants for openness categories
+CANONICAL_OPENNESS_OPEN = "open"
+CANONICAL_OPENNESS_CLOSED = "closed"
 
-
-OPENNESS_ALIASES = {
-    CANONICAL_OPENNESS_OPEN_SOURCE_OPEN_WEIGHTS: {"Open Source + Open Weights"},
-    CANONICAL_OPENNESS_OPEN_SOURCE_CLOSED_WEIGHTS: {"Open Source"},
-    CANONICAL_OPENNESS_CLOSED_API_AVAILABLE: {"API Available"},
-    CANONICAL_OPENNESS_CLOSED_UI_ONLY: {"Closed"}
+# Map old openness values to new simplified values
+OPENNESS_MAPPING = {
+    "open_source_open_weights": CANONICAL_OPENNESS_OPEN,
+    "open_source_closed_weights": CANONICAL_OPENNESS_OPEN,
+    "closed_api_available": CANONICAL_OPENNESS_CLOSED,
+    "closed_ui_only": CANONICAL_OPENNESS_CLOSED,
+    "open": CANONICAL_OPENNESS_OPEN,
+    "closed": CANONICAL_OPENNESS_CLOSED,
 }
 
-TOOL_USAGE_ALIASES = {
-    CANONICAL_TOOL_USAGE_STANDARD: {},
-    CANONICAL_TOOL_USAGE_CUSTOM_INTERFACE: {"Custom with Standard Search"},
-    CANONICAL_TOOL_USAGE_FULLY_CUSTOM: {"Fully Custom"}
+OPENNESS_ALIASES = {
+    CANONICAL_OPENNESS_OPEN: {"Open", "Open Source", "Open Source + Open Weights"},
+    CANONICAL_OPENNESS_CLOSED: {"Closed", "API Available"}
 }
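
A quick sanity check of the new mapping, as a minimal sketch (assumes the aliases.py from this commit is on the import path; the sample values are the four legacy categories plus an already-normalized one):

```python
# Sketch only: OPENNESS_MAPPING collapses the legacy four-way openness
# taxonomy into "open" vs "closed".
from aliases import OPENNESS_MAPPING

samples = [
    "open_source_open_weights",    # legacy -> "open"
    "open_source_closed_weights",  # legacy -> "open"
    "closed_api_available",        # legacy -> "closed"
    "closed_ui_only",              # legacy -> "closed"
    "open",                        # already normalized, maps to itself
]
for raw in samples:
    print(f"{raw} -> {OPENNESS_MAPPING.get(raw, raw)}")
```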
generate_mock_jsonl.py CHANGED
@@ -43,8 +43,7 @@ MOCK_AGENTS = [
     {
         "agent_name": "1.0.2",
         "llm_base": "claude-3-5-sonnet-20241022",
-        "openness": "closed_api_available",
-        "tool_usage": "standard",
+        "openness": "closed",
         "scores": {
             "swe-bench": 48.3,
             "multi-swe-bench": 35.2,
@@ -57,8 +56,7 @@ MOCK_AGENTS = [
     {
         "agent_name": "1.0.1",
         "llm_base": "gpt-4o-2024-11-20",
-        "openness": "closed_api_available",
-        "tool_usage": "standard",
+        "openness": "closed",
         "scores": {
             "swe-bench": 45.1,
             "multi-swe-bench": 32.8,
@@ -71,8 +69,7 @@ MOCK_AGENTS = [
     {
         "agent_name": "1.0.0",
         "llm_base": "gpt-4-turbo-2024-04-09",
-        "openness": "closed_api_available",
-        "tool_usage": "standard",
+        "openness": "closed",
         "scores": {
             "swe-bench": 38.7,
             "multi-swe-bench": 28.4,
@@ -85,8 +82,7 @@ MOCK_AGENTS = [
     {
         "agent_name": "0.9.5",
         "llm_base": "gpt-4o-mini-2024-07-18",
-        "openness": "closed_api_available",
-        "tool_usage": "standard",
+        "openness": "closed",
         "scores": {
             "swe-bench": 32.5,
             "multi-swe-bench": 24.1,
@@ -99,8 +95,7 @@ MOCK_AGENTS = [
     {
         "agent_name": "0.9.0",
         "llm_base": "claude-3-opus-20240229",
-        "openness": "closed_api_available",
-        "tool_usage": "custom_interface",
+        "openness": "closed",
         "scores": {
             "swe-bench": 29.8,
             "multi-swe-bench": 21.5,
@@ -148,7 +143,6 @@ def generate_mock_data():
                 "agent_name": agent["agent_name"],
                 "llm_base": agent["llm_base"],
                 "openness": agent["openness"],
-                "tool_usage": agent["tool_usage"],
                 "score": agent["scores"][benchmark_name],
                 "metric": benchmark_info["metric"],
                 "submission_time": datetime.now().isoformat(),
leaderboard_transformer.py CHANGED
@@ -110,7 +110,6 @@ def _pretty_column_name(raw_col: str) -> str:
         'Overall cost': 'Overall Cost',
         'Logs': 'Logs',
         'Openness': 'Openness',
-        'Agent tooling': 'Agent Tooling',
         'LLM base': 'Model',
         'Source': 'Source',
     }
@@ -255,9 +254,9 @@ class DataTransformer:
 
         df_view = df_sorted.copy()
 
-        # --- 3. Add Columns for Agent Openness and Tooling ---
+        # --- 3. Add Columns for Agent Openness ---
         base_cols = ["id","Agent","Submitter","Models Used","Source"]
-        new_cols = ["Openness", "Agent Tooling"]
+        new_cols = ["Openness"]
         ending_cols = ["Date", "Logs"]
 
         metrics_to_display = [primary_score_col, f"{primary_metric} Cost"]
@@ -331,13 +330,10 @@ def _plot_scatter_plotly(
 ) -> go.Figure:
 
     # --- Section 1: Define Mappings ---
-    # These include aliases for openness categories,
-    # so multiple names might correspond to the same color.
+    # Map openness to colors (simplified: open vs closed)
     color_map = {
-        aliases.CANONICAL_OPENNESS_OPEN_SOURCE_OPEN_WEIGHTS: "deeppink",
-        aliases.CANONICAL_OPENNESS_OPEN_SOURCE_CLOSED_WEIGHTS: "coral",
-        aliases.CANONICAL_OPENNESS_CLOSED_API_AVAILABLE: "yellow",
-        aliases.CANONICAL_OPENNESS_CLOSED_UI_ONLY: "white",
+        aliases.CANONICAL_OPENNESS_OPEN: "deeppink",
+        aliases.CANONICAL_OPENNESS_CLOSED: "yellow",
     }
     for canonical_openness, openness_aliases in aliases.OPENNESS_ALIASES.items():
         for openness_alias in openness_aliases:
@@ -346,26 +342,15 @@ def _plot_scatter_plotly(
     colors_for_legend = set(aliases.OPENNESS_ALIASES.keys())
     category_order = list(color_map.keys())
 
-    # These include aliases for tool usage categories,
-    # so multiple names might correspond to the same shape.
-    shape_map = {
-        aliases.CANONICAL_TOOL_USAGE_STANDARD: "star",
-        aliases.CANONICAL_TOOL_USAGE_CUSTOM_INTERFACE: "star-diamond",
-        aliases.CANONICAL_TOOL_USAGE_FULLY_CUSTOM: "star-triangle-up",
-    }
-    for canonical_tool_usage, tool_usages_aliases in aliases.TOOL_USAGE_ALIASES.items():
-        for tool_usage_alias in tool_usages_aliases:
-            shape_map[tool_usage_alias] = shape_map[canonical_tool_usage]
-    default_shape = 'square'
-    # Only keep one name per shape for the legend.
-    shapes_for_legend = set(aliases.TOOL_USAGE_ALIASES.keys())
+    # Use consistent marker shape (no tooling distinction)
+    default_shape = 'circle'
 
     x_col_to_use = x
     y_col_to_use = y
     llm_base = data["Models Used"] if "Models Used" in data.columns else "Models Used"
 
     # --- Section 2: Data Preparation---
-    required_cols = [y_col_to_use, agent_col, "Openness", "Agent Tooling"]
+    required_cols = [y_col_to_use, agent_col, "Openness"]
     if not all(col in data.columns for col in required_cols):
         logger.error(f"Missing one or more required columns for plotting: {required_cols}")
         return go.Figure()
@@ -411,7 +396,7 @@ def _plot_scatter_plotly(
         data_plot[x_col_to_use] = 0
 
     # Clean data based on all necessary columns
-    data_plot.dropna(subset=[y_col_to_use, x_col_to_use, "Openness", "Agent Tooling"], inplace=True)
+    data_plot.dropna(subset=[y_col_to_use, x_col_to_use, "Openness"], inplace=True)
 
     # --- Section 3: Initialize Figure ---
     fig = go.Figure()
@@ -458,8 +443,7 @@ def _plot_scatter_plotly(
             parts.append(f"{h_pad}{x_axis_label}: <b>Missing</b>{h_pad}<br>")
         else:
             parts.append(f"{h_pad}{x_axis_label}: <b>${row[x_col]:.2f}</b>{h_pad}<br>")
-        parts.append(f"{h_pad}Openness: <b>{row['Openness']}</b>{h_pad}<br>")
-        parts.append(f"{h_pad}Tooling: <b>{row['Agent Tooling']}</b>{h_pad}")
+        parts.append(f"{h_pad}Openness: <b>{row['Openness']}</b>{h_pad}")
 
         # Add extra vertical space (line spacing) before the next section
         parts.append("<br>")
@@ -491,7 +475,8 @@ def _plot_scatter_plotly(
         ),
         axis=1
     )
-    data_plot['shape_symbol'] = data_plot['Agent Tooling'].map(shape_map).fillna(default_shape)
+    # Use consistent shape for all points (no tooling distinction)
+    data_plot['shape_symbol'] = default_shape
 
     # --- Section 6: Plot Markers by "Openness" Category ---
     for category in category_order:
@@ -509,7 +494,7 @@ def _plot_scatter_plotly(
             hoverinfo='text',
             marker=dict(
                 color=color_map.get(category, 'black'),
-                symbol=group['shape_symbol'],
+                symbol=default_shape,
                 size=15,
                 opacity=0.8,
                 line=dict(width=1, color='deeppink')
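
To make the simplified plot coloring concrete, a minimal self-contained sketch of the alias expansion above (constants and alias sets condensed from aliases.py in this commit):

```python
# Sketch only: after the expansion loop, canonical values and their
# display-name aliases all resolve to one of exactly two colors.
CANONICAL_OPENNESS_OPEN = "open"
CANONICAL_OPENNESS_CLOSED = "closed"
OPENNESS_ALIASES = {
    CANONICAL_OPENNESS_OPEN: {"Open", "Open Source", "Open Source + Open Weights"},
    CANONICAL_OPENNESS_CLOSED: {"Closed", "API Available"},
}

color_map = {
    CANONICAL_OPENNESS_OPEN: "deeppink",
    CANONICAL_OPENNESS_CLOSED: "yellow",
}
for canonical, names in OPENNESS_ALIASES.items():
    for alias in names:
        color_map[alias] = color_map[canonical]

print(color_map["Open Source"])    # deeppink
print(color_map["API Available"])  # yellow
```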
mock_results/1.0.0-dev1/commit0.jsonl CHANGED
@@ -1,5 +1,5 @@
-{"agent_name": "1.0.2", "llm_base": "claude-3-5-sonnet-20241022", "openness": "closed_api_available", "tool_usage": "standard", "score": 71.2, "metric": "test_pass_rate", "submission_time": "2025-11-24T23:38:02.231539", "tags": ["commit0"], "total_cost": 45.6, "total_runtime": 656.0}
-{"agent_name": "1.0.1", "llm_base": "gpt-4o-2024-11-20", "openness": "closed_api_available", "tool_usage": "standard", "score": 68.9, "metric": "test_pass_rate", "submission_time": "2025-11-24T23:38:02.231556", "tags": ["commit0"], "total_cost": 44.45, "total_runtime": 644.5}
-{"agent_name": "1.0.0", "llm_base": "gpt-4-turbo-2024-04-09", "openness": "closed_api_available", "tool_usage": "standard", "score": 61.5, "metric": "test_pass_rate", "submission_time": "2025-11-24T23:38:02.231563", "tags": ["commit0"], "total_cost": 40.75, "total_runtime": 607.5}
-{"agent_name": "0.9.5", "llm_base": "gpt-4o-mini-2024-07-18", "openness": "closed_api_available", "tool_usage": "standard", "score": 55.3, "metric": "test_pass_rate", "submission_time": "2025-11-24T23:38:02.231569", "tags": ["commit0"], "total_cost": 37.65, "total_runtime": 576.5}
-{"agent_name": "0.9.0", "llm_base": "claude-3-opus-20240229", "openness": "closed_api_available", "tool_usage": "custom_interface", "score": 52.1, "metric": "test_pass_rate", "submission_time": "2025-11-24T23:38:02.231574", "tags": ["commit0"], "total_cost": 36.05, "total_runtime": 560.5}
+{"agent_name": "1.0.2", "llm_base": "claude-3-5-sonnet-20241022", "openness": "closed", "score": 71.2, "metric": "test_pass_rate", "submission_time": "2025-11-25T00:12:22.972910", "tags": ["commit0"], "total_cost": 45.6, "total_runtime": 656.0}
+{"agent_name": "1.0.1", "llm_base": "gpt-4o-2024-11-20", "openness": "closed", "score": 68.9, "metric": "test_pass_rate", "submission_time": "2025-11-25T00:12:22.972929", "tags": ["commit0"], "total_cost": 44.45, "total_runtime": 644.5}
+{"agent_name": "1.0.0", "llm_base": "gpt-4-turbo-2024-04-09", "openness": "closed", "score": 61.5, "metric": "test_pass_rate", "submission_time": "2025-11-25T00:12:22.972939", "tags": ["commit0"], "total_cost": 40.75, "total_runtime": 607.5}
+{"agent_name": "0.9.5", "llm_base": "gpt-4o-mini-2024-07-18", "openness": "closed", "score": 55.3, "metric": "test_pass_rate", "submission_time": "2025-11-25T00:12:22.972947", "tags": ["commit0"], "total_cost": 37.65, "total_runtime": 576.5}
+{"agent_name": "0.9.0", "llm_base": "claude-3-opus-20240229", "openness": "closed", "score": 52.1, "metric": "test_pass_rate", "submission_time": "2025-11-25T00:12:22.972954", "tags": ["commit0"], "total_cost": 36.05, "total_runtime": 560.5}
mock_results/1.0.0-dev1/gaia.jsonl CHANGED
@@ -1,5 +1,5 @@
-{"agent_name": "1.0.2", "llm_base": "claude-3-5-sonnet-20241022", "openness": "closed_api_available", "tool_usage": "standard", "score": 58.7, "metric": "accuracy", "submission_time": "2025-11-24T23:38:02.231711", "tags": ["gaia"], "total_cost": 39.35, "total_runtime": 593.5}
-{"agent_name": "1.0.1", "llm_base": "gpt-4o-2024-11-20", "openness": "closed_api_available", "tool_usage": "standard", "score": 55.2, "metric": "accuracy", "submission_time": "2025-11-24T23:38:02.231728", "tags": ["gaia"], "total_cost": 37.6, "total_runtime": 576.0}
-{"agent_name": "1.0.0", "llm_base": "gpt-4-turbo-2024-04-09", "openness": "closed_api_available", "tool_usage": "standard", "score": 48.3, "metric": "accuracy", "submission_time": "2025-11-24T23:38:02.231735", "tags": ["gaia"], "total_cost": 34.15, "total_runtime": 541.5}
-{"agent_name": "0.9.5", "llm_base": "gpt-4o-mini-2024-07-18", "openness": "closed_api_available", "tool_usage": "standard", "score": 42.1, "metric": "accuracy", "submission_time": "2025-11-24T23:38:02.231741", "tags": ["gaia"], "total_cost": 31.05, "total_runtime": 510.5}
-{"agent_name": "0.9.0", "llm_base": "claude-3-opus-20240229", "openness": "closed_api_available", "tool_usage": "custom_interface", "score": 39.4, "metric": "accuracy", "submission_time": "2025-11-24T23:38:02.231749", "tags": ["gaia"], "total_cost": 29.7, "total_runtime": 497.0}
+{"agent_name": "1.0.2", "llm_base": "claude-3-5-sonnet-20241022", "openness": "closed", "score": 58.7, "metric": "accuracy", "submission_time": "2025-11-25T00:12:22.973093", "tags": ["gaia"], "total_cost": 39.35, "total_runtime": 593.5}
+{"agent_name": "1.0.1", "llm_base": "gpt-4o-2024-11-20", "openness": "closed", "score": 55.2, "metric": "accuracy", "submission_time": "2025-11-25T00:12:22.973111", "tags": ["gaia"], "total_cost": 37.6, "total_runtime": 576.0}
+{"agent_name": "1.0.0", "llm_base": "gpt-4-turbo-2024-04-09", "openness": "closed", "score": 48.3, "metric": "accuracy", "submission_time": "2025-11-25T00:12:22.973121", "tags": ["gaia"], "total_cost": 34.15, "total_runtime": 541.5}
+{"agent_name": "0.9.5", "llm_base": "gpt-4o-mini-2024-07-18", "openness": "closed", "score": 42.1, "metric": "accuracy", "submission_time": "2025-11-25T00:12:22.973129", "tags": ["gaia"], "total_cost": 31.05, "total_runtime": 510.5}
+{"agent_name": "0.9.0", "llm_base": "claude-3-opus-20240229", "openness": "closed", "score": 39.4, "metric": "accuracy", "submission_time": "2025-11-25T00:12:22.973137", "tags": ["gaia"], "total_cost": 29.7, "total_runtime": 497.0}
mock_results/1.0.0-dev1/multi-swe-bench.jsonl CHANGED
@@ -1,5 +1,5 @@
-{"agent_name": "1.0.2", "llm_base": "claude-3-5-sonnet-20241022", "openness": "closed_api_available", "tool_usage": "standard", "score": 35.2, "metric": "resolve_rate", "submission_time": "2025-11-24T23:38:02.230869", "tags": ["multi-swe-bench"], "total_cost": 27.6, "total_runtime": 476.0}
-{"agent_name": "1.0.1", "llm_base": "gpt-4o-2024-11-20", "openness": "closed_api_available", "tool_usage": "standard", "score": 32.8, "metric": "resolve_rate", "submission_time": "2025-11-24T23:38:02.230889", "tags": ["multi-swe-bench"], "total_cost": 26.4, "total_runtime": 464.0}
-{"agent_name": "1.0.0", "llm_base": "gpt-4-turbo-2024-04-09", "openness": "closed_api_available", "tool_usage": "standard", "score": 28.4, "metric": "resolve_rate", "submission_time": "2025-11-24T23:38:02.230899", "tags": ["multi-swe-bench"], "total_cost": 24.2, "total_runtime": 442.0}
-{"agent_name": "0.9.5", "llm_base": "gpt-4o-mini-2024-07-18", "openness": "closed_api_available", "tool_usage": "standard", "score": 24.1, "metric": "resolve_rate", "submission_time": "2025-11-24T23:38:02.230908", "tags": ["multi-swe-bench"], "total_cost": 22.05, "total_runtime": 420.5}
-{"agent_name": "0.9.0", "llm_base": "claude-3-opus-20240229", "openness": "closed_api_available", "tool_usage": "custom_interface", "score": 21.5, "metric": "resolve_rate", "submission_time": "2025-11-24T23:38:02.230917", "tags": ["multi-swe-bench"], "total_cost": 20.75, "total_runtime": 407.5}
+{"agent_name": "1.0.2", "llm_base": "claude-3-5-sonnet-20241022", "openness": "closed", "score": 35.2, "metric": "resolve_rate", "submission_time": "2025-11-25T00:12:22.972368", "tags": ["multi-swe-bench"], "total_cost": 27.6, "total_runtime": 476.0}
+{"agent_name": "1.0.1", "llm_base": "gpt-4o-2024-11-20", "openness": "closed", "score": 32.8, "metric": "resolve_rate", "submission_time": "2025-11-25T00:12:22.972389", "tags": ["multi-swe-bench"], "total_cost": 26.4, "total_runtime": 464.0}
+{"agent_name": "1.0.0", "llm_base": "gpt-4-turbo-2024-04-09", "openness": "closed", "score": 28.4, "metric": "resolve_rate", "submission_time": "2025-11-25T00:12:22.972400", "tags": ["multi-swe-bench"], "total_cost": 24.2, "total_runtime": 442.0}
+{"agent_name": "0.9.5", "llm_base": "gpt-4o-mini-2024-07-18", "openness": "closed", "score": 24.1, "metric": "resolve_rate", "submission_time": "2025-11-25T00:12:22.972408", "tags": ["multi-swe-bench"], "total_cost": 22.05, "total_runtime": 420.5}
+{"agent_name": "0.9.0", "llm_base": "claude-3-opus-20240229", "openness": "closed", "score": 21.5, "metric": "resolve_rate", "submission_time": "2025-11-25T00:12:22.972416", "tags": ["multi-swe-bench"], "total_cost": 20.75, "total_runtime": 407.5}
mock_results/1.0.0-dev1/swe-bench-multimodal.jsonl CHANGED
@@ -1,5 +1,5 @@
-{"agent_name": "1.0.2", "llm_base": "claude-3-5-sonnet-20241022", "openness": "closed_api_available", "tool_usage": "standard", "score": 42.1, "metric": "resolve_rate", "submission_time": "2025-11-24T23:38:02.231057", "tags": ["swe-bench-multimodal"], "total_cost": 31.05, "total_runtime": 510.5}
-{"agent_name": "1.0.1", "llm_base": "gpt-4o-2024-11-20", "openness": "closed_api_available", "tool_usage": "standard", "score": 39.5, "metric": "resolve_rate", "submission_time": "2025-11-24T23:38:02.231073", "tags": ["swe-bench-multimodal"], "total_cost": 29.75, "total_runtime": 497.5}
-{"agent_name": "1.0.0", "llm_base": "gpt-4-turbo-2024-04-09", "openness": "closed_api_available", "tool_usage": "standard", "score": 34.2, "metric": "resolve_rate", "submission_time": "2025-11-24T23:38:02.231082", "tags": ["swe-bench-multimodal"], "total_cost": 27.1, "total_runtime": 471.0}
-{"agent_name": "0.9.5", "llm_base": "gpt-4o-mini-2024-07-18", "openness": "closed_api_available", "tool_usage": "standard", "score": 28.9, "metric": "resolve_rate", "submission_time": "2025-11-24T23:38:02.231088", "tags": ["swe-bench-multimodal"], "total_cost": 24.45, "total_runtime": 444.5}
-{"agent_name": "0.9.0", "llm_base": "claude-3-opus-20240229", "openness": "closed_api_available", "tool_usage": "custom_interface", "score": 25.7, "metric": "resolve_rate", "submission_time": "2025-11-24T23:38:02.231093", "tags": ["swe-bench-multimodal"], "total_cost": 22.85, "total_runtime": 428.5}
+{"agent_name": "1.0.2", "llm_base": "claude-3-5-sonnet-20241022", "openness": "closed", "score": 42.1, "metric": "resolve_rate", "submission_time": "2025-11-25T00:12:22.972550", "tags": ["swe-bench-multimodal"], "total_cost": 31.05, "total_runtime": 510.5}
+{"agent_name": "1.0.1", "llm_base": "gpt-4o-2024-11-20", "openness": "closed", "score": 39.5, "metric": "resolve_rate", "submission_time": "2025-11-25T00:12:22.972567", "tags": ["swe-bench-multimodal"], "total_cost": 29.75, "total_runtime": 497.5}
+{"agent_name": "1.0.0", "llm_base": "gpt-4-turbo-2024-04-09", "openness": "closed", "score": 34.2, "metric": "resolve_rate", "submission_time": "2025-11-25T00:12:22.972577", "tags": ["swe-bench-multimodal"], "total_cost": 27.1, "total_runtime": 471.0}
+{"agent_name": "0.9.5", "llm_base": "gpt-4o-mini-2024-07-18", "openness": "closed", "score": 28.9, "metric": "resolve_rate", "submission_time": "2025-11-25T00:12:22.972585", "tags": ["swe-bench-multimodal"], "total_cost": 24.45, "total_runtime": 444.5}
+{"agent_name": "0.9.0", "llm_base": "claude-3-opus-20240229", "openness": "closed", "score": 25.7, "metric": "resolve_rate", "submission_time": "2025-11-25T00:12:22.972593", "tags": ["swe-bench-multimodal"], "total_cost": 22.85, "total_runtime": 428.5}
mock_results/1.0.0-dev1/swe-bench.jsonl CHANGED
@@ -1,5 +1,5 @@
-{"agent_name": "1.0.2", "llm_base": "claude-3-5-sonnet-20241022", "openness": "closed_api_available", "tool_usage": "standard", "score": 48.3, "metric": "resolve_rate", "submission_time": "2025-11-24T23:38:02.230638", "tags": ["swe-bench"], "total_cost": 34.15, "total_runtime": 541.5}
-{"agent_name": "1.0.1", "llm_base": "gpt-4o-2024-11-20", "openness": "closed_api_available", "tool_usage": "standard", "score": 45.1, "metric": "resolve_rate", "submission_time": "2025-11-24T23:38:02.230668", "tags": ["swe-bench"], "total_cost": 32.55, "total_runtime": 525.5}
-{"agent_name": "1.0.0", "llm_base": "gpt-4-turbo-2024-04-09", "openness": "closed_api_available", "tool_usage": "standard", "score": 38.7, "metric": "resolve_rate", "submission_time": "2025-11-24T23:38:02.230681", "tags": ["swe-bench"], "total_cost": 29.35, "total_runtime": 493.5}
-{"agent_name": "0.9.5", "llm_base": "gpt-4o-mini-2024-07-18", "openness": "closed_api_available", "tool_usage": "standard", "score": 32.5, "metric": "resolve_rate", "submission_time": "2025-11-24T23:38:02.230689", "tags": ["swe-bench"], "total_cost": 26.25, "total_runtime": 462.5}
-{"agent_name": "0.9.0", "llm_base": "claude-3-opus-20240229", "openness": "closed_api_available", "tool_usage": "custom_interface", "score": 29.8, "metric": "resolve_rate", "submission_time": "2025-11-24T23:38:02.230696", "tags": ["swe-bench"], "total_cost": 24.9, "total_runtime": 449.0}
+{"agent_name": "1.0.2", "llm_base": "claude-3-5-sonnet-20241022", "openness": "closed", "score": 48.3, "metric": "resolve_rate", "submission_time": "2025-11-25T00:12:22.972101", "tags": ["swe-bench"], "total_cost": 34.15, "total_runtime": 541.5}
+{"agent_name": "1.0.1", "llm_base": "gpt-4o-2024-11-20", "openness": "closed", "score": 45.1, "metric": "resolve_rate", "submission_time": "2025-11-25T00:12:22.972136", "tags": ["swe-bench"], "total_cost": 32.55, "total_runtime": 525.5}
+{"agent_name": "1.0.0", "llm_base": "gpt-4-turbo-2024-04-09", "openness": "closed", "score": 38.7, "metric": "resolve_rate", "submission_time": "2025-11-25T00:12:22.972167", "tags": ["swe-bench"], "total_cost": 29.35, "total_runtime": 493.5}
+{"agent_name": "0.9.5", "llm_base": "gpt-4o-mini-2024-07-18", "openness": "closed", "score": 32.5, "metric": "resolve_rate", "submission_time": "2025-11-25T00:12:22.972178", "tags": ["swe-bench"], "total_cost": 26.25, "total_runtime": 462.5}
+{"agent_name": "0.9.0", "llm_base": "claude-3-opus-20240229", "openness": "closed", "score": 29.8, "metric": "resolve_rate", "submission_time": "2025-11-25T00:12:22.972186", "tags": ["swe-bench"], "total_cost": 24.9, "total_runtime": 449.0}
mock_results/1.0.0-dev1/swt-bench.jsonl CHANGED
@@ -1,5 +1,5 @@
-{"agent_name": "1.0.2", "llm_base": "claude-3-5-sonnet-20241022", "openness": "closed_api_available", "tool_usage": "standard", "score": 65.4, "metric": "success_rate", "submission_time": "2025-11-24T23:38:02.231319", "tags": ["swt-bench"], "total_cost": 42.7, "total_runtime": 627.0}
-{"agent_name": "1.0.1", "llm_base": "gpt-4o-2024-11-20", "openness": "closed_api_available", "tool_usage": "standard", "score": 62.3, "metric": "success_rate", "submission_time": "2025-11-24T23:38:02.231344", "tags": ["swt-bench"], "total_cost": 41.15, "total_runtime": 611.5}
-{"agent_name": "1.0.0", "llm_base": "gpt-4-turbo-2024-04-09", "openness": "closed_api_available", "tool_usage": "standard", "score": 54.1, "metric": "success_rate", "submission_time": "2025-11-24T23:38:02.231356", "tags": ["swt-bench"], "total_cost": 37.05, "total_runtime": 570.5}
-{"agent_name": "0.9.5", "llm_base": "gpt-4o-mini-2024-07-18", "openness": "closed_api_available", "tool_usage": "standard", "score": 47.8, "metric": "success_rate", "submission_time": "2025-11-24T23:38:02.231365", "tags": ["swt-bench"], "total_cost": 33.9, "total_runtime": 539.0}
-{"agent_name": "0.9.0", "llm_base": "claude-3-opus-20240229", "openness": "closed_api_available", "tool_usage": "custom_interface", "score": 44.2, "metric": "success_rate", "submission_time": "2025-11-24T23:38:02.231373", "tags": ["swt-bench"], "total_cost": 32.1, "total_runtime": 521.0}
+{"agent_name": "1.0.2", "llm_base": "claude-3-5-sonnet-20241022", "openness": "closed", "score": 65.4, "metric": "success_rate", "submission_time": "2025-11-25T00:12:22.972724", "tags": ["swt-bench"], "total_cost": 42.7, "total_runtime": 627.0}
+{"agent_name": "1.0.1", "llm_base": "gpt-4o-2024-11-20", "openness": "closed", "score": 62.3, "metric": "success_rate", "submission_time": "2025-11-25T00:12:22.972741", "tags": ["swt-bench"], "total_cost": 41.15, "total_runtime": 611.5}
+{"agent_name": "1.0.0", "llm_base": "gpt-4-turbo-2024-04-09", "openness": "closed", "score": 54.1, "metric": "success_rate", "submission_time": "2025-11-25T00:12:22.972750", "tags": ["swt-bench"], "total_cost": 37.05, "total_runtime": 570.5}
+{"agent_name": "0.9.5", "llm_base": "gpt-4o-mini-2024-07-18", "openness": "closed", "score": 47.8, "metric": "success_rate", "submission_time": "2025-11-25T00:12:22.972758", "tags": ["swt-bench"], "total_cost": 33.9, "total_runtime": 539.0}
+{"agent_name": "0.9.0", "llm_base": "claude-3-opus-20240229", "openness": "closed", "score": 44.2, "metric": "success_rate", "submission_time": "2025-11-25T00:12:22.972765", "tags": ["swt-bench"], "total_cost": 32.1, "total_runtime": 521.0}
simple_data_loader.py CHANGED
@@ -88,7 +88,6 @@ class SimpleLeaderboardViewer:
                     'agent_name': metadata.get('agent_name', 'Unknown'),
                     'llm_base': metadata.get('model', 'unknown'),
                     'openness': metadata.get('openness', 'unknown'),
-                    'tool_usage': metadata.get('tool_usage', 'standard'),
                     'submission_time': metadata.get('submission_time', ''),
                     'score': score_entry.get('score'),
                     'metric': score_entry.get('metric', 'unknown'),
@@ -152,12 +151,17 @@ class SimpleLeaderboardViewer:
 
             # Build a single record for this agent
             first_record = agent_records.iloc[0]
+
+            # Normalize openness to "open" or "closed"
+            from aliases import OPENNESS_MAPPING
+            raw_openness = first_record['openness']
+            normalized_openness = OPENNESS_MAPPING.get(raw_openness, raw_openness)
+
             record = {
                 # Core agent info - use final display names
                 'agent': agent_name,  # Will become "Agent Version" after prettifying
                 'models used': first_record['llm_base'],  # Will become "Model"
-                'openness': first_record['openness'],  # Will become "Openness"
-                'agent tooling': first_record['tool_usage'],  # Will become "Agent Tooling"
+                'openness': normalized_openness,  # Will become "Openness" (simplified to "open" or "closed")
                 'date': first_record['submission_time'],  # Will become "Date"
                 # Additional columns expected by the transformer
                 'id': first_record.get('id', agent_name),  # Will become "Id"
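
One behavior worth calling out in the hunk above: OPENNESS_MAPPING.get(raw_openness, raw_openness) falls back to the raw value, so strings outside the mapping pass through untouched rather than raising a KeyError. A minimal sketch (mapping condensed from aliases.py; "mystery_value" is a made-up input for illustration):

```python
# Sketch only: unknown openness strings survive normalization unchanged.
OPENNESS_MAPPING = {
    "open_source_open_weights": "open",
    "open_source_closed_weights": "open",
    "closed_api_available": "closed",
    "closed_ui_only": "closed",
    "open": "open",
    "closed": "closed",
}

for raw in ["closed_api_available", "open", "mystery_value"]:
    print(f"{raw} -> {OPENNESS_MAPPING.get(raw, raw)}")
# mystery_value -> mystery_value (left as-is for downstream display)
```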
ui_components.py CHANGED
@@ -36,73 +36,27 @@ from content import (
 
 api = HfApi()
 os.makedirs(EXTRACTED_DATA_DIR, exist_ok=True)
-# Global variables
-COMBINED_ICON_MAP = {
-    aliases.CANONICAL_OPENNESS_OPEN_SOURCE_OPEN_WEIGHTS: {
-        aliases.CANONICAL_TOOL_USAGE_STANDARD: "assets/os-ow-standard.svg",
-        aliases.CANONICAL_TOOL_USAGE_CUSTOM_INTERFACE: "assets/os-ow-equivalent.svg",
-        aliases.CANONICAL_TOOL_USAGE_FULLY_CUSTOM: "assets/os-ow-custom.svg",
-    },
-    aliases.CANONICAL_OPENNESS_OPEN_SOURCE_CLOSED_WEIGHTS: {
-        aliases.CANONICAL_TOOL_USAGE_STANDARD: "assets/os-standard.svg",
-        aliases.CANONICAL_TOOL_USAGE_CUSTOM_INTERFACE: "assets/os-equivalent.svg",
-        aliases.CANONICAL_TOOL_USAGE_FULLY_CUSTOM: "assets/os-custom.svg",
-    },
-    aliases.CANONICAL_OPENNESS_CLOSED_API_AVAILABLE: {
-        aliases.CANONICAL_TOOL_USAGE_STANDARD: "assets/api-standard.svg",
-        aliases.CANONICAL_TOOL_USAGE_CUSTOM_INTERFACE: "assets/api-equivalent.svg",
-        aliases.CANONICAL_TOOL_USAGE_FULLY_CUSTOM: "assets/api-custom.svg",
-    },
-    aliases.CANONICAL_OPENNESS_CLOSED_UI_ONLY: {
-        aliases.CANONICAL_TOOL_USAGE_STANDARD: "assets/c-standard.svg",
-        aliases.CANONICAL_TOOL_USAGE_CUSTOM_INTERFACE: "assets/c-equivalent.svg",
-        aliases.CANONICAL_TOOL_USAGE_FULLY_CUSTOM: "assets/c-custom.svg",
-    }
+# Simplified icon map (no tooling distinction, only openness)
+# Not actually used since we removed icons from the table, but keeping for potential future use
+OPENNESS_ICON_MAP = {
+    aliases.CANONICAL_OPENNESS_OPEN: "assets/ellipse-pink.svg",
+    aliases.CANONICAL_OPENNESS_CLOSED: "assets/ellipse-yellow.svg",
 }
 
-
-# it's important to do the tool usage first here, so that when
-# we do openness, the tool usage changes get picked up
-for openness in COMBINED_ICON_MAP:
-    for canonical_tool_usage, tool_usage_aliases in aliases.TOOL_USAGE_ALIASES.items():
-        for tool_usage_alias in tool_usage_aliases:
-            COMBINED_ICON_MAP[openness][tool_usage_alias] = COMBINED_ICON_MAP[openness][canonical_tool_usage]
-
+# Add aliases
 for canonical_openness, openness_aliases in aliases.OPENNESS_ALIASES.items():
     for openness_alias in openness_aliases:
-        COMBINED_ICON_MAP[openness_alias] = COMBINED_ICON_MAP[canonical_openness]
+        OPENNESS_ICON_MAP[openness_alias] = OPENNESS_ICON_MAP[canonical_openness]
 
 
 OPENNESS_SVG_MAP = {
-    aliases.CANONICAL_OPENNESS_OPEN_SOURCE_OPEN_WEIGHTS: {
+    aliases.CANONICAL_OPENNESS_OPEN: {
         "path": "assets/ellipse-pink.svg",
-        "description": "Code and models are open"
-    },
-    aliases.CANONICAL_OPENNESS_OPEN_SOURCE_CLOSED_WEIGHTS: {
-        "path": "assets/ellipse-coral.svg",
-        "description": "Code is open but uses closed-weight models"
+        "description": "Open source model"
     },
-    aliases.CANONICAL_OPENNESS_CLOSED_API_AVAILABLE: {
+    aliases.CANONICAL_OPENNESS_CLOSED: {
        "path": "assets/ellipse-yellow.svg",
-        "description": "No access to code; API access only"
-    },
-    aliases.CANONICAL_OPENNESS_CLOSED_UI_ONLY: {
-        "path": "assets/ellipse-white.svg",
-        "description": "No access to code or API; UI access only"
-    },
-}
-TOOLING_SVG_MAP = {
-    aliases.CANONICAL_TOOL_USAGE_STANDARD: {
-        "path": "assets/five-point-star.svg",
-        "description": "Uses only tools explicitly provided in state.tools"
-    },
-    aliases.CANONICAL_TOOL_USAGE_CUSTOM_INTERFACE: {
-        "path": "assets/four-point-star.svg",
-        "description": "Custom tools for accessing an equivalent underlying environment"
-    },
-    aliases.CANONICAL_TOOL_USAGE_FULLY_CUSTOM: {
-        "path": "assets/three-point-star.svg",
-        "description": f"Uses tools beyond constraints of {aliases.CANONICAL_TOOL_USAGE_STANDARD} or {aliases.CANONICAL_TOOL_USAGE_CUSTOM_INTERFACE}"
+        "description": "Closed source model"
     },
 }
 
@@ -116,29 +70,6 @@ def get_svg_as_data_uri(path: str) -> str:
         print(f"Warning: SVG file not found at {path}")
         return ""
 
-# Create a pre-loaded version of our map. This should be run ONCE when the app starts.
-PRELOADED_URI_MAP = {
-    openness: {
-        tooling: get_svg_as_data_uri(path)
-        for tooling, path in tooling_map.items()
-    }
-    for openness, tooling_map in COMBINED_ICON_MAP.items()
-}
-
-def get_combined_icon_html(row, uri_map):
-    """
-    Looks up the correct icon URI from the pre-loaded map based on the row's
-    'Openness' and 'Agent Tooling' values and returns an HTML <img> tag.
-    """
-    openness_val = row['Openness']
-    tooling_val = row['Agent Tooling']
-    uri = uri_map.get(openness_val, {}).get(tooling_val, "")
-    # The tooltip will show the exact combination for clarity.
-    tooltip = f"Openness: {openness_val}, Tooling: {tooling_val}"
-
-    # Return the HTML string that Gradio will render in the DataFrame.
-    return f'<img src="{uri}" alt="{tooltip}" title="{tooltip}" style="width:24px; height:24px;">'
-
 def create_svg_html(value, svg_map):
     """
     Generates the absolute simplest HTML for an icon, without any extra text.
@@ -162,18 +93,12 @@ def create_svg_html(value, svg_map):
 
 def build_openness_tooltip_content() -> str:
     """
-    Generates the inner HTML for the Agent Openness tooltip card,
+    Generates the inner HTML for the Model Openness tooltip card.
     """
-    descriptions = {
-        aliases.CANONICAL_OPENNESS_OPEN_SOURCE_OPEN_WEIGHTS: "Both code and ML models are open",
-        aliases.CANONICAL_OPENNESS_OPEN_SOURCE_CLOSED_WEIGHTS: "Code is open but uses an ML model with closed-weights",
-        aliases.CANONICAL_OPENNESS_CLOSED_API_AVAILABLE: "No access to code; API access only",
-        aliases.CANONICAL_OPENNESS_CLOSED_UI_ONLY: "No access to code or API; UI access only",
-    }
     html_items = []
     for name, info in OPENNESS_SVG_MAP.items():
         uri = get_svg_as_data_uri(info["path"])
-        desc = descriptions.get(name, "")
+        desc = info["description"]
 
         html_items.append(f"""
             <div class="tooltip-legend-item">
@@ -190,8 +115,8 @@ def build_openness_tooltip_content() -> str:
     return f"""<span class="tooltip-icon-legend">
 
     <span class="tooltip-card">
-        <h3>Agent Openness</h3>
-        <p class="tooltip-description">Indicates how transparent and reproducible an agent is.</p>
+        <h3>Model Openness</h3>
+        <p class="tooltip-description">Indicates whether the language model is open source or closed source.</p>
         <div class="tooltip-items-container">{joined_items}</div>
     </span>
     </span>"""
@@ -215,48 +140,7 @@ def build_pareto_tooltip_content() -> str:
         </div>
     """
 
-def build_tooling_tooltip_content() -> str:
-    """Generates the inner HTML for the Agent Tooling tooltip card."""
-    descriptions = {
-        aliases.CANONICAL_TOOL_USAGE_STANDARD: "Uses only predefined tools from the evaluation environment (as defined in Inspect's state.tools).",
-        aliases.CANONICAL_TOOL_USAGE_CUSTOM_INTERFACE: "Custom tools for accessing an equivalent underlying environment:",
-        aliases.CANONICAL_TOOL_USAGE_FULLY_CUSTOM: f"Uses tools beyond constraints of {aliases.CANONICAL_TOOL_USAGE_STANDARD} or {aliases.CANONICAL_TOOL_USAGE_CUSTOM_INTERFACE}",
-    }
-    custom_interface_sub_list = """
-        <ul class="tooltip-sub-list">
-            <li>Literature tasks: Information access is limited to date-restricted usage of the Asta MCP tools.</li>
-            <li>Code tasks: Code execution is limited to an iPython shell in a machine environment initialized with the standard Asta sandbox Dockerfile (or equivalent).</li>
-        </ul>
-    """
-    html_items = []
-    for name, info in TOOLING_SVG_MAP.items():
-        uri = get_svg_as_data_uri(info["path"])
-        desc = descriptions.get(name, "")
-
-        # Check if this is the special case that needs a sub-list
-        sub_list_html = custom_interface_sub_list if name == aliases.CANONICAL_TOOL_USAGE_CUSTOM_INTERFACE else ""
-
-        html_items.append(f"""
-            <div class="tooltip-legend-item">
-                <img src="{uri}" alt="{name}">
-                <div>
-                    <strong>{name}</strong>
-                    <span>{desc}</span>
-                    {sub_list_html}
-                </div>
-            </div>
-        """)
-
-    joined_items = "".join(html_items)
-
-    return f"""<span class="tooltip-icon-legend">
-
-    <span class="tooltip-card">
-        <h3>Agent Tooling</h3>
-        <p class="tooltip-description">Describes the tool usage and execution environment of the agent during evaluation.</p>
-        <div class="tooltip-items-container">{joined_items}</div>
-    </span>
-    </span>"""
 
 
 def build_descriptions_tooltip_content(table) -> str:
@@ -303,9 +187,6 @@ def build_descriptions_tooltip_content(table) -> str:
     <div class="tooltip-description-item"><b>Logs:</b> View evaluation run logs (e.g., outputs, traces).</div>
     """
 
-# Dynamically generate the correct HTML for the legend parts
-openness_html = " ".join([create_svg_html(name, OPENNESS_SVG_MAP) for name in OPENNESS_SVG_MAP])
-tooling_html = " ".join([create_svg_html(name, TOOLING_SVG_MAP) for name in TOOLING_SVG_MAP])
 # Create HTML for the "Openness" legend items for table
 openness_html_items = []
 for name, info in OPENNESS_SVG_MAP.items():
@@ -319,21 +200,8 @@ for name, info in OPENNESS_SVG_MAP.items():
     )
 openness_html = " ".join(openness_html_items)
 
-# Create HTML for the "Tooling" legend items for table
-tooling_html_items = []
-for name, info in TOOLING_SVG_MAP.items():
-    uri = get_svg_as_data_uri(info["path"])
-    tooling_html_items.append(
-        f'<div style="display: flex; align-items: center; white-space: nowrap;">'
-        f'<img src="{uri}" alt="{name}" title="{name}" style="width:16px; height:16px; margin-right: 4px; flex-shrink: 0;">'
-        f'<span>{name}</span>'
-        f'</div>'
-    )
-tooling_html = " ".join(tooling_html_items)
-
 pareto_tooltip_content = build_pareto_tooltip_content()
 openness_tooltip_content = build_openness_tooltip_content()
-tooling_tooltip_content = build_tooling_tooltip_content()
 
 def create_legend_markdown(which_table: str) -> str:
     """
@@ -358,16 +226,10 @@ def create_legend_markdown(which_table: str) -> str:
         </div>
 
         <div> <!-- Container for the Openness section -->
-            <b>Agent Openness</b>
+            <b>Model Openness</b>
            {openness_tooltip_content}
             <div class="table-legend-item">{openness_html}</div>
         </div>
-
-        <div> <!-- Container for the Tooling section -->
-            <b>Agent Tooling</b>
-            {tooling_tooltip_content}
-            <div class="table-legend-item">{tooling_html}</div>
-        </div>
 
         <div><!-- Container for the Column Descriptions section -->
             <b>Column Descriptions</b>
@@ -400,22 +262,6 @@ for name, info in OPENNESS_SVG_MAP.items():
             f'</div>'
         )
 
-tooling_legend_items = []
-for name, info in TOOLING_SVG_MAP.items():
-    uri = get_svg_as_data_uri(info["path"])
-    if uri:
-        tooling_legend_items.append(
-            f'<div class="plot-legend-item">'
-            f'<img class="plot-legend-item-svg plot-legend-tooling-svg" src="{uri}" alt="{name}" title="{name}">'
-            f'<div class="plot-legend-item-text">'
-            f'<div>'
-            f'<span>{name}</span>'
-            f'</div>'
-            f'<span class="description">{info["description"]}</span>'
-            f'</div>'
-            f'</div>'
-        )
-
 plot_legend_html = f"""
 <div class="plot-legend-container">
     <div id="plot-legend-logo">
@@ -430,16 +276,10 @@ plot_legend_html = f"""
         </div>
     </div>
 </div>
-<div style="margin-bottom: 16px;">
-    <span class="plot-legend-category-heading">Agent Openness</span>
-    <div style="margin-top: 8px;">
-        {''.join(openness_legend_items)}
-    </div>
-</div>
 <div>
-    <span class="plot-legend-category-heading">Agent Tooling</span>
+    <span class="plot-legend-category-heading">Model Openness</span>
     <div style="margin-top: 8px;">
-        {''.join(tooling_legend_items)}
+        {''.join(openness_legend_items)}
     </div>
 </div>
 </div>
@@ -666,7 +506,7 @@ def create_benchmark_details_display(
     benchmark_cost_col = f"{benchmark_name} Cost"
 
     # Define the columns needed for the detailed table
-    table_cols = ['Agent','Source','Openness','Agent Tooling', 'Submitter', 'Date', benchmark_score_col, benchmark_cost_col,'Logs','id', 'Models Used']
+    table_cols = ['Agent','Source','Openness', 'Date', benchmark_score_col, benchmark_cost_col,'Logs','id', 'Models Used']
 
     # Filter to only columns that actually exist in the full dataframe
     existing_table_cols = [col for col in table_cols if col in full_df.columns]
@@ -690,10 +530,13 @@ def create_benchmark_details_display(
         axis=1
     )
 
-    benchmark_table_df['Icon'] = benchmark_table_df.apply(
-        lambda row: get_combined_icon_html(row, PRELOADED_URI_MAP),
-        axis=1  # IMPORTANT: axis=1 tells pandas to process row-by-row
-    )
+    # Create simple openness icons
+    def get_openness_icon_html(row):
+        openness_val = row.get('Openness', '')
+        uri = get_svg_as_data_uri(OPENNESS_ICON_MAP.get(openness_val, "assets/ellipse-pink.svg"))
+        return f'<img src="{uri}" alt="{openness_val}" title="{openness_val}" style="width:24px; height:24px;">'
+
+    benchmark_table_df['Icon'] = benchmark_table_df.apply(get_openness_icon_html, axis=1)
 
     #Make pretty and format the Models Used column
    benchmark_table_df['Models Used'] = benchmark_table_df['Models Used'].apply(clean_llm_base_list)
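
Finally, a standalone sketch of the new per-row icon helper from the last hunk. get_svg_as_data_uri is stubbed here so the snippet runs on its own (in ui_components.py it returns the SVG file's contents as a data URI); the fallback to the pink ellipse for unrecognized values mirrors the code above:

```python
# Sketch only: build the <img> tag Gradio renders in the Icon column.
OPENNESS_ICON_MAP = {
    "open": "assets/ellipse-pink.svg",
    "closed": "assets/ellipse-yellow.svg",
}

def get_svg_as_data_uri(path: str) -> str:
    # Stub for illustration; the real helper reads and inlines the file.
    return f"data:image/svg+xml;utf8,{path}"

def get_openness_icon_html(row: dict) -> str:
    openness_val = row.get('Openness', '')
    uri = get_svg_as_data_uri(OPENNESS_ICON_MAP.get(openness_val, "assets/ellipse-pink.svg"))
    return f'<img src="{uri}" alt="{openness_val}" title="{openness_val}" style="width:24px; height:24px;">'

print(get_openness_icon_html({"Openness": "closed"}))
```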