openhands committed
Commit ca754bb · Parent: 701c496

Simplify leaderboard to open vs closed models only

Remove agent tooling distinction from the leaderboard:
- Simplified openness from 4 categories to 2: 'open' and 'closed'
- Removed all 'tool_usage' references from data structures
- Updated UI to remove Agent Tooling column and icons
- Simplified scatter plots to use only 2 colors (open/closed)
- Updated legends to show Model Openness instead of Agent Openness/Tooling
- Regenerated mock data without tool_usage field
- Added openness normalization to handle legacy values

Files modified:
- aliases.py: Simplified openness constants, removed tool usage
- simple_data_loader.py: Added openness normalization, removed tool_usage loading
- leaderboard_transformer.py: Removed Agent Tooling column, simplified color mapping
- ui_components.py: Removed tooling icons/tooltips, simplified to openness only
- generate_mock_jsonl.py: Removed tool_usage from mock data generation

aliases.py CHANGED
@@ -1,22 +1,18 @@
-# Define constants that were previously imported from agenteval
-CANONICAL_OPENNESS_OPEN_SOURCE_OPEN_WEIGHTS = "open_source_open_weights"
-CANONICAL_OPENNESS_OPEN_SOURCE_CLOSED_WEIGHTS = "open_source_closed_weights"
-CANONICAL_OPENNESS_CLOSED_API_AVAILABLE = "closed_api_available"
-CANONICAL_OPENNESS_CLOSED_UI_ONLY = "closed_ui_only"
-CANONICAL_TOOL_USAGE_STANDARD = "standard"
-CANONICAL_TOOL_USAGE_CUSTOM_INTERFACE = "custom_interface"
-CANONICAL_TOOL_USAGE_FULLY_CUSTOM = "fully_custom"
+# Define constants for openness categories
+CANONICAL_OPENNESS_OPEN = "open"
+CANONICAL_OPENNESS_CLOSED = "closed"
 
-
-OPENNESS_ALIASES = {
-    CANONICAL_OPENNESS_OPEN_SOURCE_OPEN_WEIGHTS: {"Open Source + Open Weights"},
-    CANONICAL_OPENNESS_OPEN_SOURCE_CLOSED_WEIGHTS: {"Open Source"},
-    CANONICAL_OPENNESS_CLOSED_API_AVAILABLE: {"API Available"},
-    CANONICAL_OPENNESS_CLOSED_UI_ONLY: {"Closed"}
+# Map old openness values to new simplified values
+OPENNESS_MAPPING = {
+    "open_source_open_weights": CANONICAL_OPENNESS_OPEN,
+    "open_source_closed_weights": CANONICAL_OPENNESS_OPEN,
+    "closed_api_available": CANONICAL_OPENNESS_CLOSED,
+    "closed_ui_only": CANONICAL_OPENNESS_CLOSED,
+    "open": CANONICAL_OPENNESS_OPEN,
+    "closed": CANONICAL_OPENNESS_CLOSED,
 }
 
-TOOL_USAGE_ALIASES = {
-    CANONICAL_TOOL_USAGE_STANDARD: {},
-    CANONICAL_TOOL_USAGE_CUSTOM_INTERFACE: {"Custom with Standard Search"},
-    CANONICAL_TOOL_USAGE_FULLY_CUSTOM: {"Fully Custom"}
+OPENNESS_ALIASES = {
+    CANONICAL_OPENNESS_OPEN: {"Open", "Open Source", "Open Source + Open Weights"},
+    CANONICAL_OPENNESS_CLOSED: {"Closed", "API Available"}
 }
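
A quick sanity check of the new mapping, as a minimal sketch (assumes the aliases.py from this commit is on the import path; the sample values are the four legacy categories plus an already-normalized one):

```python
# Sketch only: OPENNESS_MAPPING collapses the legacy four-way openness
# taxonomy into "open" vs "closed".
from aliases import OPENNESS_MAPPING

samples = [
    "open_source_open_weights",    # legacy -> "open"
    "open_source_closed_weights",  # legacy -> "open"
    "closed_api_available",        # legacy -> "closed"
    "closed_ui_only",              # legacy -> "closed"
    "open",                        # already normalized, maps to itself
]
for raw in samples:
    print(f"{raw} -> {OPENNESS_MAPPING.get(raw, raw)}")
```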
generate_mock_jsonl.py CHANGED
@@ -43,8 +43,7 @@ MOCK_AGENTS = [
     {
         "agent_name": "1.0.2",
         "llm_base": "claude-3-5-sonnet-20241022",
-        "openness": "closed_api_available",
-        "tool_usage": "standard",
+        "openness": "closed",
         "scores": {
             "swe-bench": 48.3,
             "multi-swe-bench": 35.2,
@@ -57,8 +56,7 @@ MOCK_AGENTS = [
     {
         "agent_name": "1.0.1",
         "llm_base": "gpt-4o-2024-11-20",
-        "openness": "closed_api_available",
-        "tool_usage": "standard",
+        "openness": "closed",
         "scores": {
             "swe-bench": 45.1,
             "multi-swe-bench": 32.8,
@@ -71,8 +69,7 @@ MOCK_AGENTS = [
     {
         "agent_name": "1.0.0",
         "llm_base": "gpt-4-turbo-2024-04-09",
-        "openness": "closed_api_available",
-        "tool_usage": "standard",
+        "openness": "closed",
         "scores": {
             "swe-bench": 38.7,
             "multi-swe-bench": 28.4,
@@ -85,8 +82,7 @@ MOCK_AGENTS = [
     {
         "agent_name": "0.9.5",
         "llm_base": "gpt-4o-mini-2024-07-18",
-        "openness": "closed_api_available",
-        "tool_usage": "standard",
+        "openness": "closed",
         "scores": {
             "swe-bench": 32.5,
             "multi-swe-bench": 24.1,
@@ -99,8 +95,7 @@ MOCK_AGENTS = [
     {
         "agent_name": "0.9.0",
         "llm_base": "claude-3-opus-20240229",
-        "openness": "closed_api_available",
-        "tool_usage": "custom_interface",
+        "openness": "closed",
         "scores": {
             "swe-bench": 29.8,
             "multi-swe-bench": 21.5,
@@ -148,7 +143,6 @@ def generate_mock_data():
                 "agent_name": agent["agent_name"],
                 "llm_base": agent["llm_base"],
                 "openness": agent["openness"],
-                "tool_usage": agent["tool_usage"],
                 "score": agent["scores"][benchmark_name],
                 "metric": benchmark_info["metric"],
                 "submission_time": datetime.now().isoformat(),
leaderboard_transformer.py CHANGED
@@ -110,7 +110,6 @@ def _pretty_column_name(raw_col: str) -> str:
         'Overall cost': 'Overall Cost',
         'Logs': 'Logs',
         'Openness': 'Openness',
-        'Agent tooling': 'Agent Tooling',
         'LLM base': 'Model',
         'Source': 'Source',
     }
@@ -255,9 +254,9 @@ class DataTransformer:
 
         df_view = df_sorted.copy()
 
-        # --- 3. Add Columns for Agent Openness and Tooling ---
+        # --- 3. Add Columns for Agent Openness ---
         base_cols = ["id","Agent","Submitter","Models Used","Source"]
-        new_cols = ["Openness", "Agent Tooling"]
+        new_cols = ["Openness"]
         ending_cols = ["Date", "Logs"]
 
         metrics_to_display = [primary_score_col, f"{primary_metric} Cost"]
@@ -331,13 +330,10 @@ def _plot_scatter_plotly(
 ) -> go.Figure:
 
     # --- Section 1: Define Mappings ---
-    # These include aliases for openness categories,
-    # so multiple names might correspond to the same color.
+    # Map openness to colors (simplified: open vs closed)
     color_map = {
-        aliases.CANONICAL_OPENNESS_OPEN_SOURCE_OPEN_WEIGHTS: "deeppink",
-        aliases.CANONICAL_OPENNESS_OPEN_SOURCE_CLOSED_WEIGHTS: "coral",
-        aliases.CANONICAL_OPENNESS_CLOSED_API_AVAILABLE: "yellow",
-        aliases.CANONICAL_OPENNESS_CLOSED_UI_ONLY: "white",
+        aliases.CANONICAL_OPENNESS_OPEN: "deeppink",
+        aliases.CANONICAL_OPENNESS_CLOSED: "yellow",
     }
     for canonical_openness, openness_aliases in aliases.OPENNESS_ALIASES.items():
         for openness_alias in openness_aliases:
@@ -346,26 +342,15 @@ def _plot_scatter_plotly(
     colors_for_legend = set(aliases.OPENNESS_ALIASES.keys())
     category_order = list(color_map.keys())
 
-    # These include aliases for tool usage categories,
-    # so multiple names might correspond to the same shape.
-    shape_map = {
-        aliases.CANONICAL_TOOL_USAGE_STANDARD: "star",
-        aliases.CANONICAL_TOOL_USAGE_CUSTOM_INTERFACE: "star-diamond",
-        aliases.CANONICAL_TOOL_USAGE_FULLY_CUSTOM: "star-triangle-up",
-    }
-    for canonical_tool_usage, tool_usages_aliases in aliases.TOOL_USAGE_ALIASES.items():
-        for tool_usage_alias in tool_usages_aliases:
-            shape_map[tool_usage_alias] = shape_map[canonical_tool_usage]
-    default_shape = 'square'
-    # Only keep one name per shape for the legend.
-    shapes_for_legend = set(aliases.TOOL_USAGE_ALIASES.keys())
+    # Use consistent marker shape (no tooling distinction)
+    default_shape = 'circle'
 
     x_col_to_use = x
     y_col_to_use = y
     llm_base = data["Models Used"] if "Models Used" in data.columns else "Models Used"
 
     # --- Section 2: Data Preparation---
-    required_cols = [y_col_to_use, agent_col, "Openness", "Agent Tooling"]
+    required_cols = [y_col_to_use, agent_col, "Openness"]
     if not all(col in data.columns for col in required_cols):
         logger.error(f"Missing one or more required columns for plotting: {required_cols}")
         return go.Figure()
@@ -411,7 +396,7 @@ def _plot_scatter_plotly(
         data_plot[x_col_to_use] = 0
 
     # Clean data based on all necessary columns
-    data_plot.dropna(subset=[y_col_to_use, x_col_to_use, "Openness", "Agent Tooling"], inplace=True)
+    data_plot.dropna(subset=[y_col_to_use, x_col_to_use, "Openness"], inplace=True)
 
     # --- Section 3: Initialize Figure ---
     fig = go.Figure()
@@ -458,8 +443,7 @@ def _plot_scatter_plotly(
             parts.append(f"{h_pad}{x_axis_label}: <b>Missing</b>{h_pad}<br>")
         else:
             parts.append(f"{h_pad}{x_axis_label}: <b>${row[x_col]:.2f}</b>{h_pad}<br>")
-        parts.append(f"{h_pad}Openness: <b>{row['Openness']}</b>{h_pad}<br>")
-        parts.append(f"{h_pad}Tooling: <b>{row['Agent Tooling']}</b>{h_pad}")
+        parts.append(f"{h_pad}Openness: <b>{row['Openness']}</b>{h_pad}")
 
         # Add extra vertical space (line spacing) before the next section
         parts.append("<br>")
@@ -491,7 +475,8 @@ def _plot_scatter_plotly(
         ),
         axis=1
     )
-    data_plot['shape_symbol'] = data_plot['Agent Tooling'].map(shape_map).fillna(default_shape)
+    # Use consistent shape for all points (no tooling distinction)
+    data_plot['shape_symbol'] = default_shape
 
     # --- Section 6: Plot Markers by "Openness" Category ---
     for category in category_order:
@@ -509,7 +494,7 @@ def _plot_scatter_plotly(
             hoverinfo='text',
             marker=dict(
                 color=color_map.get(category, 'black'),
-                symbol=group['shape_symbol'],
+                symbol=default_shape,
                 size=15,
                 opacity=0.8,
                 line=dict(width=1, color='deeppink')
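
To make the simplified plot coloring concrete, a minimal self-contained sketch of the alias expansion above (constants and alias sets condensed from aliases.py in this commit):

```python
# Sketch only: after the expansion loop, canonical values and their
# display-name aliases all resolve to one of exactly two colors.
CANONICAL_OPENNESS_OPEN = "open"
CANONICAL_OPENNESS_CLOSED = "closed"
OPENNESS_ALIASES = {
    CANONICAL_OPENNESS_OPEN: {"Open", "Open Source", "Open Source + Open Weights"},
    CANONICAL_OPENNESS_CLOSED: {"Closed", "API Available"},
}

color_map = {
    CANONICAL_OPENNESS_OPEN: "deeppink",
    CANONICAL_OPENNESS_CLOSED: "yellow",
}
for canonical, names in OPENNESS_ALIASES.items():
    for alias in names:
        color_map[alias] = color_map[canonical]

print(color_map["Open Source"])    # deeppink
print(color_map["API Available"])  # yellow
```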
mock_results/1.0.0-dev1/commit0.jsonl CHANGED
@@ -1,5 +1,5 @@
-{"agent_name": "1.0.2", "llm_base": "claude-3-5-sonnet-20241022", "openness": "closed_api_available", "tool_usage": "standard", "score": 71.2, "metric": "test_pass_rate", "submission_time": "2025-11-24T23:38:02.231539", "tags": ["commit0"], "total_cost": 45.6, "total_runtime": 656.0}
-{"agent_name": "1.0.1", "llm_base": "gpt-4o-2024-11-20", "openness": "closed_api_available", "tool_usage": "standard", "score": 68.9, "metric": "test_pass_rate", "submission_time": "2025-11-24T23:38:02.231556", "tags": ["commit0"], "total_cost": 44.45, "total_runtime": 644.5}
-{"agent_name": "1.0.0", "llm_base": "gpt-4-turbo-2024-04-09", "openness": "closed_api_available", "tool_usage": "standard", "score": 61.5, "metric": "test_pass_rate", "submission_time": "2025-11-24T23:38:02.231563", "tags": ["commit0"], "total_cost": 40.75, "total_runtime": 607.5}
-{"agent_name": "0.9.5", "llm_base": "gpt-4o-mini-2024-07-18", "openness": "closed_api_available", "tool_usage": "standard", "score": 55.3, "metric": "test_pass_rate", "submission_time": "2025-11-24T23:38:02.231569", "tags": ["commit0"], "total_cost": 37.65, "total_runtime": 576.5}
-{"agent_name": "0.9.0", "llm_base": "claude-3-opus-20240229", "openness": "closed_api_available", "tool_usage": "custom_interface", "score": 52.1, "metric": "test_pass_rate", "submission_time": "2025-11-24T23:38:02.231574", "tags": ["commit0"], "total_cost": 36.05, "total_runtime": 560.5}
+{"agent_name": "1.0.2", "llm_base": "claude-3-5-sonnet-20241022", "openness": "closed", "score": 71.2, "metric": "test_pass_rate", "submission_time": "2025-11-25T00:12:22.972910", "tags": ["commit0"], "total_cost": 45.6, "total_runtime": 656.0}
+{"agent_name": "1.0.1", "llm_base": "gpt-4o-2024-11-20", "openness": "closed", "score": 68.9, "metric": "test_pass_rate", "submission_time": "2025-11-25T00:12:22.972929", "tags": ["commit0"], "total_cost": 44.45, "total_runtime": 644.5}
+{"agent_name": "1.0.0", "llm_base": "gpt-4-turbo-2024-04-09", "openness": "closed", "score": 61.5, "metric": "test_pass_rate", "submission_time": "2025-11-25T00:12:22.972939", "tags": ["commit0"], "total_cost": 40.75, "total_runtime": 607.5}
+{"agent_name": "0.9.5", "llm_base": "gpt-4o-mini-2024-07-18", "openness": "closed", "score": 55.3, "metric": "test_pass_rate", "submission_time": "2025-11-25T00:12:22.972947", "tags": ["commit0"], "total_cost": 37.65, "total_runtime": 576.5}
+{"agent_name": "0.9.0", "llm_base": "claude-3-opus-20240229", "openness": "closed", "score": 52.1, "metric": "test_pass_rate", "submission_time": "2025-11-25T00:12:22.972954", "tags": ["commit0"], "total_cost": 36.05, "total_runtime": 560.5}
mock_results/1.0.0-dev1/gaia.jsonl CHANGED
@@ -1,5 +1,5 @@
-{"agent_name": "1.0.2", "llm_base": "claude-3-5-sonnet-20241022", "openness": "closed_api_available", "tool_usage": "standard", "score": 58.7, "metric": "accuracy", "submission_time": "2025-11-24T23:38:02.231711", "tags": ["gaia"], "total_cost": 39.35, "total_runtime": 593.5}
-{"agent_name": "1.0.1", "llm_base": "gpt-4o-2024-11-20", "openness": "closed_api_available", "tool_usage": "standard", "score": 55.2, "metric": "accuracy", "submission_time": "2025-11-24T23:38:02.231728", "tags": ["gaia"], "total_cost": 37.6, "total_runtime": 576.0}
-{"agent_name": "1.0.0", "llm_base": "gpt-4-turbo-2024-04-09", "openness": "closed_api_available", "tool_usage": "standard", "score": 48.3, "metric": "accuracy", "submission_time": "2025-11-24T23:38:02.231735", "tags": ["gaia"], "total_cost": 34.15, "total_runtime": 541.5}
-{"agent_name": "0.9.5", "llm_base": "gpt-4o-mini-2024-07-18", "openness": "closed_api_available", "tool_usage": "standard", "score": 42.1, "metric": "accuracy", "submission_time": "2025-11-24T23:38:02.231741", "tags": ["gaia"], "total_cost": 31.05, "total_runtime": 510.5}
-{"agent_name": "0.9.0", "llm_base": "claude-3-opus-20240229", "openness": "closed_api_available", "tool_usage": "custom_interface", "score": 39.4, "metric": "accuracy", "submission_time": "2025-11-24T23:38:02.231749", "tags": ["gaia"], "total_cost": 29.7, "total_runtime": 497.0}
+{"agent_name": "1.0.2", "llm_base": "claude-3-5-sonnet-20241022", "openness": "closed", "score": 58.7, "metric": "accuracy", "submission_time": "2025-11-25T00:12:22.973093", "tags": ["gaia"], "total_cost": 39.35, "total_runtime": 593.5}
+{"agent_name": "1.0.1", "llm_base": "gpt-4o-2024-11-20", "openness": "closed", "score": 55.2, "metric": "accuracy", "submission_time": "2025-11-25T00:12:22.973111", "tags": ["gaia"], "total_cost": 37.6, "total_runtime": 576.0}
+{"agent_name": "1.0.0", "llm_base": "gpt-4-turbo-2024-04-09", "openness": "closed", "score": 48.3, "metric": "accuracy", "submission_time": "2025-11-25T00:12:22.973121", "tags": ["gaia"], "total_cost": 34.15, "total_runtime": 541.5}
+{"agent_name": "0.9.5", "llm_base": "gpt-4o-mini-2024-07-18", "openness": "closed", "score": 42.1, "metric": "accuracy", "submission_time": "2025-11-25T00:12:22.973129", "tags": ["gaia"], "total_cost": 31.05, "total_runtime": 510.5}
+{"agent_name": "0.9.0", "llm_base": "claude-3-opus-20240229", "openness": "closed", "score": 39.4, "metric": "accuracy", "submission_time": "2025-11-25T00:12:22.973137", "tags": ["gaia"], "total_cost": 29.7, "total_runtime": 497.0}
mock_results/1.0.0-dev1/multi-swe-bench.jsonl CHANGED
@@ -1,5 +1,5 @@
-{"agent_name": "1.0.2", "llm_base": "claude-3-5-sonnet-20241022", "openness": "closed_api_available", "tool_usage": "standard", "score": 35.2, "metric": "resolve_rate", "submission_time": "2025-11-24T23:38:02.230869", "tags": ["multi-swe-bench"], "total_cost": 27.6, "total_runtime": 476.0}
-{"agent_name": "1.0.1", "llm_base": "gpt-4o-2024-11-20", "openness": "closed_api_available", "tool_usage": "standard", "score": 32.8, "metric": "resolve_rate", "submission_time": "2025-11-24T23:38:02.230889", "tags": ["multi-swe-bench"], "total_cost": 26.4, "total_runtime": 464.0}
-{"agent_name": "1.0.0", "llm_base": "gpt-4-turbo-2024-04-09", "openness": "closed_api_available", "tool_usage": "standard", "score": 28.4, "metric": "resolve_rate", "submission_time": "2025-11-24T23:38:02.230899", "tags": ["multi-swe-bench"], "total_cost": 24.2, "total_runtime": 442.0}
-{"agent_name": "0.9.5", "llm_base": "gpt-4o-mini-2024-07-18", "openness": "closed_api_available", "tool_usage": "standard", "score": 24.1, "metric": "resolve_rate", "submission_time": "2025-11-24T23:38:02.230908", "tags": ["multi-swe-bench"], "total_cost": 22.05, "total_runtime": 420.5}
-{"agent_name": "0.9.0", "llm_base": "claude-3-opus-20240229", "openness": "closed_api_available", "tool_usage": "custom_interface", "score": 21.5, "metric": "resolve_rate", "submission_time": "2025-11-24T23:38:02.230917", "tags": ["multi-swe-bench"], "total_cost": 20.75, "total_runtime": 407.5}
+{"agent_name": "1.0.2", "llm_base": "claude-3-5-sonnet-20241022", "openness": "closed", "score": 35.2, "metric": "resolve_rate", "submission_time": "2025-11-25T00:12:22.972368", "tags": ["multi-swe-bench"], "total_cost": 27.6, "total_runtime": 476.0}
+{"agent_name": "1.0.1", "llm_base": "gpt-4o-2024-11-20", "openness": "closed", "score": 32.8, "metric": "resolve_rate", "submission_time": "2025-11-25T00:12:22.972389", "tags": ["multi-swe-bench"], "total_cost": 26.4, "total_runtime": 464.0}
+{"agent_name": "1.0.0", "llm_base": "gpt-4-turbo-2024-04-09", "openness": "closed", "score": 28.4, "metric": "resolve_rate", "submission_time": "2025-11-25T00:12:22.972400", "tags": ["multi-swe-bench"], "total_cost": 24.2, "total_runtime": 442.0}
+{"agent_name": "0.9.5", "llm_base": "gpt-4o-mini-2024-07-18", "openness": "closed", "score": 24.1, "metric": "resolve_rate", "submission_time": "2025-11-25T00:12:22.972408", "tags": ["multi-swe-bench"], "total_cost": 22.05, "total_runtime": 420.5}
+{"agent_name": "0.9.0", "llm_base": "claude-3-opus-20240229", "openness": "closed", "score": 21.5, "metric": "resolve_rate", "submission_time": "2025-11-25T00:12:22.972416", "tags": ["multi-swe-bench"], "total_cost": 20.75, "total_runtime": 407.5}
mock_results/1.0.0-dev1/swe-bench-multimodal.jsonl CHANGED
@@ -1,5 +1,5 @@
-{"agent_name": "1.0.2", "llm_base": "claude-3-5-sonnet-20241022", "openness": "closed_api_available", "tool_usage": "standard", "score": 42.1, "metric": "resolve_rate", "submission_time": "2025-11-24T23:38:02.231057", "tags": ["swe-bench-multimodal"], "total_cost": 31.05, "total_runtime": 510.5}
-{"agent_name": "1.0.1", "llm_base": "gpt-4o-2024-11-20", "openness": "closed_api_available", "tool_usage": "standard", "score": 39.5, "metric": "resolve_rate", "submission_time": "2025-11-24T23:38:02.231073", "tags": ["swe-bench-multimodal"], "total_cost": 29.75, "total_runtime": 497.5}
-{"agent_name": "1.0.0", "llm_base": "gpt-4-turbo-2024-04-09", "openness": "closed_api_available", "tool_usage": "standard", "score": 34.2, "metric": "resolve_rate", "submission_time": "2025-11-24T23:38:02.231082", "tags": ["swe-bench-multimodal"], "total_cost": 27.1, "total_runtime": 471.0}
-{"agent_name": "0.9.5", "llm_base": "gpt-4o-mini-2024-07-18", "openness": "closed_api_available", "tool_usage": "standard", "score": 28.9, "metric": "resolve_rate", "submission_time": "2025-11-24T23:38:02.231088", "tags": ["swe-bench-multimodal"], "total_cost": 24.45, "total_runtime": 444.5}
-{"agent_name": "0.9.0", "llm_base": "claude-3-opus-20240229", "openness": "closed_api_available", "tool_usage": "custom_interface", "score": 25.7, "metric": "resolve_rate", "submission_time": "2025-11-24T23:38:02.231093", "tags": ["swe-bench-multimodal"], "total_cost": 22.85, "total_runtime": 428.5}
+{"agent_name": "1.0.2", "llm_base": "claude-3-5-sonnet-20241022", "openness": "closed", "score": 42.1, "metric": "resolve_rate", "submission_time": "2025-11-25T00:12:22.972550", "tags": ["swe-bench-multimodal"], "total_cost": 31.05, "total_runtime": 510.5}
+{"agent_name": "1.0.1", "llm_base": "gpt-4o-2024-11-20", "openness": "closed", "score": 39.5, "metric": "resolve_rate", "submission_time": "2025-11-25T00:12:22.972567", "tags": ["swe-bench-multimodal"], "total_cost": 29.75, "total_runtime": 497.5}
+{"agent_name": "1.0.0", "llm_base": "gpt-4-turbo-2024-04-09", "openness": "closed", "score": 34.2, "metric": "resolve_rate", "submission_time": "2025-11-25T00:12:22.972577", "tags": ["swe-bench-multimodal"], "total_cost": 27.1, "total_runtime": 471.0}
+{"agent_name": "0.9.5", "llm_base": "gpt-4o-mini-2024-07-18", "openness": "closed", "score": 28.9, "metric": "resolve_rate", "submission_time": "2025-11-25T00:12:22.972585", "tags": ["swe-bench-multimodal"], "total_cost": 24.45, "total_runtime": 444.5}
+{"agent_name": "0.9.0", "llm_base": "claude-3-opus-20240229", "openness": "closed", "score": 25.7, "metric": "resolve_rate", "submission_time": "2025-11-25T00:12:22.972593", "tags": ["swe-bench-multimodal"], "total_cost": 22.85, "total_runtime": 428.5}
mock_results/1.0.0-dev1/swe-bench.jsonl CHANGED
@@ -1,5 +1,5 @@
-{"agent_name": "1.0.2", "llm_base": "claude-3-5-sonnet-20241022", "openness": "closed_api_available", "tool_usage": "standard", "score": 48.3, "metric": "resolve_rate", "submission_time": "2025-11-24T23:38:02.230638", "tags": ["swe-bench"], "total_cost": 34.15, "total_runtime": 541.5}
-{"agent_name": "1.0.1", "llm_base": "gpt-4o-2024-11-20", "openness": "closed_api_available", "tool_usage": "standard", "score": 45.1, "metric": "resolve_rate", "submission_time": "2025-11-24T23:38:02.230668", "tags": ["swe-bench"], "total_cost": 32.55, "total_runtime": 525.5}
-{"agent_name": "1.0.0", "llm_base": "gpt-4-turbo-2024-04-09", "openness": "closed_api_available", "tool_usage": "standard", "score": 38.7, "metric": "resolve_rate", "submission_time": "2025-11-24T23:38:02.230681", "tags": ["swe-bench"], "total_cost": 29.35, "total_runtime": 493.5}
-{"agent_name": "0.9.5", "llm_base": "gpt-4o-mini-2024-07-18", "openness": "closed_api_available", "tool_usage": "standard", "score": 32.5, "metric": "resolve_rate", "submission_time": "2025-11-24T23:38:02.230689", "tags": ["swe-bench"], "total_cost": 26.25, "total_runtime": 462.5}
-{"agent_name": "0.9.0", "llm_base": "claude-3-opus-20240229", "openness": "closed_api_available", "tool_usage": "custom_interface", "score": 29.8, "metric": "resolve_rate", "submission_time": "2025-11-24T23:38:02.230696", "tags": ["swe-bench"], "total_cost": 24.9, "total_runtime": 449.0}
+{"agent_name": "1.0.2", "llm_base": "claude-3-5-sonnet-20241022", "openness": "closed", "score": 48.3, "metric": "resolve_rate", "submission_time": "2025-11-25T00:12:22.972101", "tags": ["swe-bench"], "total_cost": 34.15, "total_runtime": 541.5}
+{"agent_name": "1.0.1", "llm_base": "gpt-4o-2024-11-20", "openness": "closed", "score": 45.1, "metric": "resolve_rate", "submission_time": "2025-11-25T00:12:22.972136", "tags": ["swe-bench"], "total_cost": 32.55, "total_runtime": 525.5}
+{"agent_name": "1.0.0", "llm_base": "gpt-4-turbo-2024-04-09", "openness": "closed", "score": 38.7, "metric": "resolve_rate", "submission_time": "2025-11-25T00:12:22.972167", "tags": ["swe-bench"], "total_cost": 29.35, "total_runtime": 493.5}
+{"agent_name": "0.9.5", "llm_base": "gpt-4o-mini-2024-07-18", "openness": "closed", "score": 32.5, "metric": "resolve_rate", "submission_time": "2025-11-25T00:12:22.972178", "tags": ["swe-bench"], "total_cost": 26.25, "total_runtime": 462.5}
+{"agent_name": "0.9.0", "llm_base": "claude-3-opus-20240229", "openness": "closed", "score": 29.8, "metric": "resolve_rate", "submission_time": "2025-11-25T00:12:22.972186", "tags": ["swe-bench"], "total_cost": 24.9, "total_runtime": 449.0}
mock_results/1.0.0-dev1/swt-bench.jsonl CHANGED
@@ -1,5 +1,5 @@
-{"agent_name": "1.0.2", "llm_base": "claude-3-5-sonnet-20241022", "openness": "closed_api_available", "tool_usage": "standard", "score": 65.4, "metric": "success_rate", "submission_time": "2025-11-24T23:38:02.231319", "tags": ["swt-bench"], "total_cost": 42.7, "total_runtime": 627.0}
-{"agent_name": "1.0.1", "llm_base": "gpt-4o-2024-11-20", "openness": "closed_api_available", "tool_usage": "standard", "score": 62.3, "metric": "success_rate", "submission_time": "2025-11-24T23:38:02.231344", "tags": ["swt-bench"], "total_cost": 41.15, "total_runtime": 611.5}
-{"agent_name": "1.0.0", "llm_base": "gpt-4-turbo-2024-04-09", "openness": "closed_api_available", "tool_usage": "standard", "score": 54.1, "metric": "success_rate", "submission_time": "2025-11-24T23:38:02.231356", "tags": ["swt-bench"], "total_cost": 37.05, "total_runtime": 570.5}
-{"agent_name": "0.9.5", "llm_base": "gpt-4o-mini-2024-07-18", "openness": "closed_api_available", "tool_usage": "standard", "score": 47.8, "metric": "success_rate", "submission_time": "2025-11-24T23:38:02.231365", "tags": ["swt-bench"], "total_cost": 33.9, "total_runtime": 539.0}
-{"agent_name": "0.9.0", "llm_base": "claude-3-opus-20240229", "openness": "closed_api_available", "tool_usage": "custom_interface", "score": 44.2, "metric": "success_rate", "submission_time": "2025-11-24T23:38:02.231373", "tags": ["swt-bench"], "total_cost": 32.1, "total_runtime": 521.0}
+{"agent_name": "1.0.2", "llm_base": "claude-3-5-sonnet-20241022", "openness": "closed", "score": 65.4, "metric": "success_rate", "submission_time": "2025-11-25T00:12:22.972724", "tags": ["swt-bench"], "total_cost": 42.7, "total_runtime": 627.0}
+{"agent_name": "1.0.1", "llm_base": "gpt-4o-2024-11-20", "openness": "closed", "score": 62.3, "metric": "success_rate", "submission_time": "2025-11-25T00:12:22.972741", "tags": ["swt-bench"], "total_cost": 41.15, "total_runtime": 611.5}
+{"agent_name": "1.0.0", "llm_base": "gpt-4-turbo-2024-04-09", "openness": "closed", "score": 54.1, "metric": "success_rate", "submission_time": "2025-11-25T00:12:22.972750", "tags": ["swt-bench"], "total_cost": 37.05, "total_runtime": 570.5}
+{"agent_name": "0.9.5", "llm_base": "gpt-4o-mini-2024-07-18", "openness": "closed", "score": 47.8, "metric": "success_rate", "submission_time": "2025-11-25T00:12:22.972758", "tags": ["swt-bench"], "total_cost": 33.9, "total_runtime": 539.0}
+{"agent_name": "0.9.0", "llm_base": "claude-3-opus-20240229", "openness": "closed", "score": 44.2, "metric": "success_rate", "submission_time": "2025-11-25T00:12:22.972765", "tags": ["swt-bench"], "total_cost": 32.1, "total_runtime": 521.0}
simple_data_loader.py CHANGED
@@ -88,7 +88,6 @@ class SimpleLeaderboardViewer:
                     'agent_name': metadata.get('agent_name', 'Unknown'),
                     'llm_base': metadata.get('model', 'unknown'),
                     'openness': metadata.get('openness', 'unknown'),
-                    'tool_usage': metadata.get('tool_usage', 'standard'),
                     'submission_time': metadata.get('submission_time', ''),
                     'score': score_entry.get('score'),
                     'metric': score_entry.get('metric', 'unknown'),
@@ -152,12 +151,17 @@ class SimpleLeaderboardViewer:
 
             # Build a single record for this agent
             first_record = agent_records.iloc[0]
+
+            # Normalize openness to "open" or "closed"
+            from aliases import OPENNESS_MAPPING
+            raw_openness = first_record['openness']
+            normalized_openness = OPENNESS_MAPPING.get(raw_openness, raw_openness)
+
             record = {
                 # Core agent info - use final display names
                 'agent': agent_name,  # Will become "Agent Version" after prettifying
                 'models used': first_record['llm_base'],  # Will become "Model"
-                'openness': first_record['openness'],  # Will become "Openness"
-                'agent tooling': first_record['tool_usage'],  # Will become "Agent Tooling"
+                'openness': normalized_openness,  # Will become "Openness" (simplified to "open" or "closed")
                 'date': first_record['submission_time'],  # Will become "Date"
                 # Additional columns expected by the transformer
                 'id': first_record.get('id', agent_name),  # Will become "Id"
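
One behavior worth calling out in the hunk above: OPENNESS_MAPPING.get(raw_openness, raw_openness) falls back to the raw value, so strings outside the mapping pass through untouched rather than raising a KeyError. A minimal sketch (mapping condensed from aliases.py; "mystery_value" is a made-up input for illustration):

```python
# Sketch only: unknown openness strings survive normalization unchanged.
OPENNESS_MAPPING = {
    "open_source_open_weights": "open",
    "open_source_closed_weights": "open",
    "closed_api_available": "closed",
    "closed_ui_only": "closed",
    "open": "open",
    "closed": "closed",
}

for raw in ["closed_api_available", "open", "mystery_value"]:
    print(f"{raw} -> {OPENNESS_MAPPING.get(raw, raw)}")
# mystery_value -> mystery_value (left as-is for downstream display)
```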
ui_components.py CHANGED
@@ -36,73 +36,27 @@ from content import (
 
 api = HfApi()
 os.makedirs(EXTRACTED_DATA_DIR, exist_ok=True)
-# Global variables
-COMBINED_ICON_MAP = {
-    aliases.CANONICAL_OPENNESS_OPEN_SOURCE_OPEN_WEIGHTS: {
-        aliases.CANONICAL_TOOL_USAGE_STANDARD: "assets/os-ow-standard.svg",
-        aliases.CANONICAL_TOOL_USAGE_CUSTOM_INTERFACE: "assets/os-ow-equivalent.svg",
-        aliases.CANONICAL_TOOL_USAGE_FULLY_CUSTOM: "assets/os-ow-custom.svg",
-    },
-    aliases.CANONICAL_OPENNESS_OPEN_SOURCE_CLOSED_WEIGHTS: {
-        aliases.CANONICAL_TOOL_USAGE_STANDARD: "assets/os-standard.svg",
-        aliases.CANONICAL_TOOL_USAGE_CUSTOM_INTERFACE: "assets/os-equivalent.svg",
-        aliases.CANONICAL_TOOL_USAGE_FULLY_CUSTOM: "assets/os-custom.svg",
-    },
-    aliases.CANONICAL_OPENNESS_CLOSED_API_AVAILABLE: {
-        aliases.CANONICAL_TOOL_USAGE_STANDARD: "assets/api-standard.svg",
-        aliases.CANONICAL_TOOL_USAGE_CUSTOM_INTERFACE: "assets/api-equivalent.svg",
-        aliases.CANONICAL_TOOL_USAGE_FULLY_CUSTOM: "assets/api-custom.svg",
-    },
-    aliases.CANONICAL_OPENNESS_CLOSED_UI_ONLY: {
-        aliases.CANONICAL_TOOL_USAGE_STANDARD: "assets/c-standard.svg",
-        aliases.CANONICAL_TOOL_USAGE_CUSTOM_INTERFACE: "assets/c-equivalent.svg",
-        aliases.CANONICAL_TOOL_USAGE_FULLY_CUSTOM: "assets/c-custom.svg",
-    }
+# Simplified icon map (no tooling distinction, only openness)
+# Not actually used since we removed icons from the table, but keeping for potential future use
+OPENNESS_ICON_MAP = {
+    aliases.CANONICAL_OPENNESS_OPEN: "assets/ellipse-pink.svg",
+    aliases.CANONICAL_OPENNESS_CLOSED: "assets/ellipse-yellow.svg",
 }
 
-
-# it's important to do the tool usage first here, so that when
-# we do openness, the tool usage changes get picked up
-for openness in COMBINED_ICON_MAP:
-    for canonical_tool_usage, tool_usage_aliases in aliases.TOOL_USAGE_ALIASES.items():
-        for tool_usage_alias in tool_usage_aliases:
-            COMBINED_ICON_MAP[openness][tool_usage_alias] = COMBINED_ICON_MAP[openness][canonical_tool_usage]
-
+# Add aliases
 for canonical_openness, openness_aliases in aliases.OPENNESS_ALIASES.items():
     for openness_alias in openness_aliases:
-        COMBINED_ICON_MAP[openness_alias] = COMBINED_ICON_MAP[canonical_openness]
+        OPENNESS_ICON_MAP[openness_alias] = OPENNESS_ICON_MAP[canonical_openness]
 
 
 OPENNESS_SVG_MAP = {
-    aliases.CANONICAL_OPENNESS_OPEN_SOURCE_OPEN_WEIGHTS: {
+    aliases.CANONICAL_OPENNESS_OPEN: {
         "path": "assets/ellipse-pink.svg",
-        "description": "Code and models are open"
-    },
-    aliases.CANONICAL_OPENNESS_OPEN_SOURCE_CLOSED_WEIGHTS: {
-        "path": "assets/ellipse-coral.svg",
-        "description": "Code is open but uses closed-weight models"
+        "description": "Open source model"
     },
-    aliases.CANONICAL_OPENNESS_CLOSED_API_AVAILABLE: {
+    aliases.CANONICAL_OPENNESS_CLOSED: {
        "path": "assets/ellipse-yellow.svg",
-        "description": "No access to code; API access only"
-    },
-    aliases.CANONICAL_OPENNESS_CLOSED_UI_ONLY: {
-        "path": "assets/ellipse-white.svg",
-        "description": "No access to code or API; UI access only"
-    },
-}
-TOOLING_SVG_MAP = {
-    aliases.CANONICAL_TOOL_USAGE_STANDARD: {
-        "path": "assets/five-point-star.svg",
-        "description": "Uses only tools explicitly provided in state.tools"
-    },
-    aliases.CANONICAL_TOOL_USAGE_CUSTOM_INTERFACE: {
-        "path": "assets/four-point-star.svg",
-        "description": "Custom tools for accessing an equivalent underlying environment"
-    },
-    aliases.CANONICAL_TOOL_USAGE_FULLY_CUSTOM: {
-        "path": "assets/three-point-star.svg",
-        "description": f"Uses tools beyond constraints of {aliases.CANONICAL_TOOL_USAGE_STANDARD} or {aliases.CANONICAL_TOOL_USAGE_CUSTOM_INTERFACE}"
+        "description": "Closed source model"
     },
 }
 
@@ -116,29 +70,6 @@ def get_svg_as_data_uri(path: str) -> str:
         print(f"Warning: SVG file not found at {path}")
         return ""
 
-# Create a pre-loaded version of our map. This should be run ONCE when the app starts.
-PRELOADED_URI_MAP = {
-    openness: {
-        tooling: get_svg_as_data_uri(path)
-        for tooling, path in tooling_map.items()
-    }
-    for openness, tooling_map in COMBINED_ICON_MAP.items()
-}
-
-def get_combined_icon_html(row, uri_map):
-    """
-    Looks up the correct icon URI from the pre-loaded map based on the row's
-    'Openness' and 'Agent Tooling' values and returns an HTML <img> tag.
-    """
-    openness_val = row['Openness']
-    tooling_val = row['Agent Tooling']
-    uri = uri_map.get(openness_val, {}).get(tooling_val, "")
-    # The tooltip will show the exact combination for clarity.
-    tooltip = f"Openness: {openness_val}, Tooling: {tooling_val}"
-
-    # Return the HTML string that Gradio will render in the DataFrame.
-    return f'<img src="{uri}" alt="{tooltip}" title="{tooltip}" style="width:24px; height:24px;">'
-
 def create_svg_html(value, svg_map):
     """
     Generates the absolute simplest HTML for an icon, without any extra text.
@@ -162,18 +93,12 @@ def create_svg_html(value, svg_map):
 
 def build_openness_tooltip_content() -> str:
     """
-    Generates the inner HTML for the Agent Openness tooltip card,
+    Generates the inner HTML for the Model Openness tooltip card.
     """
-    descriptions = {
-        aliases.CANONICAL_OPENNESS_OPEN_SOURCE_OPEN_WEIGHTS: "Both code and ML models are open",
-        aliases.CANONICAL_OPENNESS_OPEN_SOURCE_CLOSED_WEIGHTS: "Code is open but uses an ML model with closed-weights",
-        aliases.CANONICAL_OPENNESS_CLOSED_API_AVAILABLE: "No access to code; API access only",
-        aliases.CANONICAL_OPENNESS_CLOSED_UI_ONLY: "No access to code or API; UI access only",
-    }
     html_items = []
     for name, info in OPENNESS_SVG_MAP.items():
         uri = get_svg_as_data_uri(info["path"])
-        desc = descriptions.get(name, "")
+        desc = info["description"]
 
         html_items.append(f"""
             <div class="tooltip-legend-item">
@@ -190,8 +115,8 @@ def build_openness_tooltip_content() -> str:
     return f"""<span class="tooltip-icon-legend">
 
     <span class="tooltip-card">
-        <h3>Agent Openness</h3>
-        <p class="tooltip-description">Indicates how transparent and reproducible an agent is.</p>
+        <h3>Model Openness</h3>
+        <p class="tooltip-description">Indicates whether the language model is open source or closed source.</p>
         <div class="tooltip-items-container">{joined_items}</div>
     </span>
     </span>"""
@@ -215,48 +140,7 @@ def build_pareto_tooltip_content() -> str:
         </div>
     """
 
-def build_tooling_tooltip_content() -> str:
-    """Generates the inner HTML for the Agent Tooling tooltip card."""
-    descriptions = {
-        aliases.CANONICAL_TOOL_USAGE_STANDARD: "Uses only predefined tools from the evaluation environment (as defined in Inspect's state.tools).",
-        aliases.CANONICAL_TOOL_USAGE_CUSTOM_INTERFACE: "Custom tools for accessing an equivalent underlying environment:",
-        aliases.CANONICAL_TOOL_USAGE_FULLY_CUSTOM: f"Uses tools beyond constraints of {aliases.CANONICAL_TOOL_USAGE_STANDARD} or {aliases.CANONICAL_TOOL_USAGE_CUSTOM_INTERFACE}",
-    }
-    custom_interface_sub_list = """
-        <ul class="tooltip-sub-list">
-            <li>Literature tasks: Information access is limited to date-restricted usage of the Asta MCP tools.</li>
-            <li>Code tasks: Code execution is limited to an iPython shell in a machine environment initialized with the standard Asta sandbox Dockerfile (or equivalent).</li>
-        </ul>
-    """
-    html_items = []
-    for name, info in TOOLING_SVG_MAP.items():
-        uri = get_svg_as_data_uri(info["path"])
-        desc = descriptions.get(name, "")
-
-        # Check if this is the special case that needs a sub-list
-        sub_list_html = custom_interface_sub_list if name == aliases.CANONICAL_TOOL_USAGE_CUSTOM_INTERFACE else ""
-
-        html_items.append(f"""
-            <div class="tooltip-legend-item">
-                <img src="{uri}" alt="{name}">
-                <div>
-                    <strong>{name}</strong>
-                    <span>{desc}</span>
-                    {sub_list_html}
-                </div>
-            </div>
-        """)
-
-    joined_items = "".join(html_items)
-
-    return f"""<span class="tooltip-icon-legend">
-
-    <span class="tooltip-card">
-        <h3>Agent Tooling</h3>
-        <p class="tooltip-description">Describes the tool usage and execution environment of the agent during evaluation.</p>
-        <div class="tooltip-items-container">{joined_items}</div>
-    </span>
-    </span>"""
 
 
 def build_descriptions_tooltip_content(table) -> str:
@@ -303,9 +187,6 @@ def build_descriptions_tooltip_content(table) -> str:
     <div class="tooltip-description-item"><b>Logs:</b> View evaluation run logs (e.g., outputs, traces).</div>
     """
 
-# Dynamically generate the correct HTML for the legend parts
-openness_html = " ".join([create_svg_html(name, OPENNESS_SVG_MAP) for name in OPENNESS_SVG_MAP])
-tooling_html = " ".join([create_svg_html(name, TOOLING_SVG_MAP) for name in TOOLING_SVG_MAP])
 # Create HTML for the "Openness" legend items for table
 openness_html_items = []
 for name, info in OPENNESS_SVG_MAP.items():
@@ -319,21 +200,8 @@ for name, info in OPENNESS_SVG_MAP.items():
     )
 openness_html = " ".join(openness_html_items)
 
-# Create HTML for the "Tooling" legend items for table
-tooling_html_items = []
-for name, info in TOOLING_SVG_MAP.items():
-    uri = get_svg_as_data_uri(info["path"])
-    tooling_html_items.append(
-        f'<div style="display: flex; align-items: center; white-space: nowrap;">'
-        f'<img src="{uri}" alt="{name}" title="{name}" style="width:16px; height:16px; margin-right: 4px; flex-shrink: 0;">'
-        f'<span>{name}</span>'
-        f'</div>'
-    )
-tooling_html = " ".join(tooling_html_items)
-
 pareto_tooltip_content = build_pareto_tooltip_content()
 openness_tooltip_content = build_openness_tooltip_content()
-tooling_tooltip_content = build_tooling_tooltip_content()
 
 def create_legend_markdown(which_table: str) -> str:
     """
@@ -358,16 +226,10 @@ def create_legend_markdown(which_table: str) -> str:
         </div>
 
         <div> <!-- Container for the Openness section -->
-            <b>Agent Openness</b>
+            <b>Model Openness</b>
            {openness_tooltip_content}
             <div class="table-legend-item">{openness_html}</div>
         </div>
-
-        <div> <!-- Container for the Tooling section -->
-            <b>Agent Tooling</b>
-            {tooling_tooltip_content}
-            <div class="table-legend-item">{tooling_html}</div>
-        </div>
 
         <div><!-- Container for the Column Descriptions section -->
             <b>Column Descriptions</b>
@@ -400,22 +262,6 @@ for name, info in OPENNESS_SVG_MAP.items():
             f'</div>'
         )
 
-tooling_legend_items = []
-for name, info in TOOLING_SVG_MAP.items():
-    uri = get_svg_as_data_uri(info["path"])
-    if uri:
-        tooling_legend_items.append(
-            f'<div class="plot-legend-item">'
-            f'<img class="plot-legend-item-svg plot-legend-tooling-svg" src="{uri}" alt="{name}" title="{name}">'
-            f'<div class="plot-legend-item-text">'
-            f'<div>'
-            f'<span>{name}</span>'
-            f'</div>'
-            f'<span class="description">{info["description"]}</span>'
-            f'</div>'
-            f'</div>'
-        )
-
 plot_legend_html = f"""
 <div class="plot-legend-container">
     <div id="plot-legend-logo">
@@ -430,16 +276,10 @@ plot_legend_html = f"""
         </div>
     </div>
 </div>
-<div style="margin-bottom: 16px;">
-    <span class="plot-legend-category-heading">Agent Openness</span>
-    <div style="margin-top: 8px;">
-        {''.join(openness_legend_items)}
-    </div>
-</div>
 <div>
-    <span class="plot-legend-category-heading">Agent Tooling</span>
+    <span class="plot-legend-category-heading">Model Openness</span>
     <div style="margin-top: 8px;">
-        {''.join(tooling_legend_items)}
+        {''.join(openness_legend_items)}
     </div>
 </div>
 </div>
@@ -666,7 +506,7 @@ def create_benchmark_details_display(
     benchmark_cost_col = f"{benchmark_name} Cost"
 
     # Define the columns needed for the detailed table
-    table_cols = ['Agent','Source','Openness','Agent Tooling', 'Submitter', 'Date', benchmark_score_col, benchmark_cost_col,'Logs','id', 'Models Used']
+    table_cols = ['Agent','Source','Openness', 'Date', benchmark_score_col, benchmark_cost_col,'Logs','id', 'Models Used']
 
     # Filter to only columns that actually exist in the full dataframe
     existing_table_cols = [col for col in table_cols if col in full_df.columns]
@@ -690,10 +530,13 @@ def create_benchmark_details_display(
         axis=1
     )
 
-    benchmark_table_df['Icon'] = benchmark_table_df.apply(
-        lambda row: get_combined_icon_html(row, PRELOADED_URI_MAP),
-        axis=1  # IMPORTANT: axis=1 tells pandas to process row-by-row
-    )
+    # Create simple openness icons
+    def get_openness_icon_html(row):
+        openness_val = row.get('Openness', '')
+        uri = get_svg_as_data_uri(OPENNESS_ICON_MAP.get(openness_val, "assets/ellipse-pink.svg"))
+        return f'<img src="{uri}" alt="{openness_val}" title="{openness_val}" style="width:24px; height:24px;">'
+
+    benchmark_table_df['Icon'] = benchmark_table_df.apply(get_openness_icon_html, axis=1)
 
     #Make pretty and format the Models Used column
    benchmark_table_df['Models Used'] = benchmark_table_df['Models Used'].apply(clean_llm_base_list)
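
Finally, a standalone sketch of the new per-row icon helper from the last hunk. get_svg_as_data_uri is stubbed here so the snippet runs on its own (in ui_components.py it returns the SVG file's contents as a data URI); the fallback to the pink ellipse for unrecognized values mirrors the code above:

```python
# Sketch only: build the <img> tag Gradio renders in the Icon column.
OPENNESS_ICON_MAP = {
    "open": "assets/ellipse-pink.svg",
    "closed": "assets/ellipse-yellow.svg",
}

def get_svg_as_data_uri(path: str) -> str:
    # Stub for illustration; the real helper reads and inlines the file.
    return f"data:image/svg+xml;utf8,{path}"

def get_openness_icon_html(row: dict) -> str:
    openness_val = row.get('Openness', '')
    uri = get_svg_as_data_uri(OPENNESS_ICON_MAP.get(openness_val, "assets/ellipse-pink.svg"))
    return f'<img src="{uri}" alt="{openness_val}" title="{openness_val}" style="width:24px; height:24px;">'

print(get_openness_icon_html({"Openness": "closed"}))
```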