Spaces:

OpenHands
/

openhands-index

Running

openhands openhands committed 18 days ago

Commit

855423e

1 Parent(s): 64c8899

Cleanup codebase: remove unused code, simplify data loading, and add pre-release notice

- Remove unused github_data_loader.py (71 lines of dead code)
- Clean up debug logging in app.py (replaced ~40 print statements with proper logger)
- Remove scheduler code and APScheduler dependency from requirements.txt
- Simplify simple_data_loader.py by removing old JSONL format fallback
- Update setup_data.py to fetch data from results/ directory only
- Update mock data to use new agent-centric directory structure
- Add pre-release notice to main page intro paragraph

Co-authored-by: openhands <[email protected]>

Files changed (27) hide show

app.py +14 -68
content.py +4 -0
github_data_loader.py +0 -71
mock_results/1.0.0-dev1/agenteval.json +0 -74
mock_results/1.0.0-dev1/commit0.jsonl +0 -5
mock_results/1.0.0-dev1/gaia.jsonl +0 -5
mock_results/1.0.0-dev1/multi-swe-bench.jsonl +0 -5
mock_results/1.0.0-dev1/results/20251124_claude_3_5_sonnet_20241022/metadata.json +9 -0
mock_results/1.0.0-dev1/results/20251124_claude_3_5_sonnet_20241022/scores.json +62 -0
mock_results/1.0.0-dev1/results/20251124_claude_3_opus_20240229/metadata.json +9 -0
mock_results/1.0.0-dev1/results/20251124_claude_3_opus_20240229/scores.json +62 -0
mock_results/1.0.0-dev1/results/20251124_gpt_4_turbo_2024_04_09/metadata.json +9 -0
mock_results/1.0.0-dev1/results/20251124_gpt_4_turbo_2024_04_09/scores.json +62 -0
mock_results/1.0.0-dev1/results/20251124_gpt_4o_2024_11_20/metadata.json +9 -0
mock_results/1.0.0-dev1/results/20251124_gpt_4o_2024_11_20/scores.json +62 -0
mock_results/1.0.0-dev1/results/20251124_gpt_4o_mini_2024_07_18/metadata.json +9 -0
mock_results/1.0.0-dev1/results/20251124_gpt_4o_mini_2024_07_18/scores.json +62 -0
mock_results/1.0.0-dev1/swe-bench-multimodal.jsonl +0 -5
mock_results/1.0.0-dev1/swe-bench.jsonl +0 -5
mock_results/1.0.0-dev1/swt-bench.jsonl +0 -5
mock_results/1.0.0-dev1/test.jsonl +0 -30
mock_results/1.0.0-dev1/test.parquet +0 -0
mock_results/1.0.0-dev1/validation.jsonl +0 -30
mock_results/1.0.0-dev1/validation.parquet +0 -0
requirements.txt +0 -1
setup_data.py +16 -20
simple_data_loader.py +6 -33

app.py CHANGED Viewed

@@ -2,56 +2,34 @@
 import logging
 import sys
-logging.basicConfig(level=logging.INFO)  # Changed to INFO for better debugging
-print("=" * 80, file=sys.stderr)
-print("STARTING APP.PY", file=sys.stderr)
-print("=" * 80, file=sys.stderr)
 # Setup mock data before anything else
 try:
-    print("Importing setup_data module...", file=sys.stderr)
     from setup_data import setup_mock_data
-    print("Calling setup_mock_data()...", file=sys.stderr)
     setup_mock_data()
-    print("✓ Data setup completed successfully", file=sys.stderr)
 except Exception as e:
-    print(f"!!! ERROR during data setup: {e}", file=sys.stderr)
-    import traceback
-    traceback.print_exc()
-    print("Continuing with app startup despite error...", file=sys.stderr)
-print("Importing gradio...", file=sys.stderr)
 import gradio as gr
-print("✓ Gradio imported", file=sys.stderr)
 import urllib.parse
-print("Importing dependencies...", file=sys.stderr)
-from apscheduler.schedulers.background import BackgroundScheduler
 from huggingface_hub import HfApi
-print("✓ Dependencies imported", file=sys.stderr)
-print("Importing config...", file=sys.stderr)
 from config import LEADERBOARD_PATH, LOCAL_DEBUG
-print(f"✓ Config imported (LOCAL_DEBUG={LOCAL_DEBUG})", file=sys.stderr)
-print("Importing content and pages...", file=sys.stderr)
 from content import css
-print("✓ css imported", file=sys.stderr)
 from main_page import build_page as build_main_page
-print("✓ main_page imported", file=sys.stderr)
 from bug_fixing import build_page as build_bug_fixing_page
-print("✓ bug_fixing imported", file=sys.stderr)
 from app_creation import build_page as build_app_creation_page
-print("✓ app_creation imported", file=sys.stderr)
 from frontend_development import build_page as build_frontend_page
-print("✓ frontend_development imported", file=sys.stderr)
 from test_generation import build_page as build_test_generation_page
-print("✓ test_generation imported", file=sys.stderr)
 from information_gathering import build_page as build_information_gathering_page
-print("✓ information_gathering imported", file=sys.stderr)
 from about import build_page as build_about_page
-print("✓ All pages imported", file=sys.stderr)
 api = HfApi()
 LOGO_PATH = "assets/logo.svg"
@@ -193,7 +171,7 @@ try:
     encoded_svg = urllib.parse.quote(svg_content)
     home_icon_data_uri = f"data:image/svg+xml,{encoded_svg}"
 except FileNotFoundError:
-    print(f"Warning: Home icon file not found at {LOGO_PATH}.")
     home_icon_data_uri = "none"
 # --- This is the final CSS ---
@@ -228,77 +206,45 @@ final_css = css + f"""
 }}
 """
 # --- Gradio App Definition ---
-print("Creating Gradio Blocks...", file=sys.stderr)
 demo = gr.Blocks(
     theme=theme,
     css=final_css,
     head=scroll_script + redirect_script + tooltip_script,
     title="OpenHands Index",
 )
-print("✓ Gradio Blocks created", file=sys.stderr)
-print("Building Home page route...", file=sys.stderr)
 with demo.route("Home", "/home"):
     build_main_page()
-print("✓ Home page built", file=sys.stderr)
-print("Building Bug Fixing page route...", file=sys.stderr)
 with demo.route("Bug Fixing", "/bug-fixing"):
     build_bug_fixing_page()
-print("✓ Bug Fixing page built", file=sys.stderr)
-print("Building App Creation page route...", file=sys.stderr)
 with demo.route("App Creation", "/app-creation"):
     build_app_creation_page()
-print("✓ App Creation page built", file=sys.stderr)
-print("Building Frontend Development page route...", file=sys.stderr)
 with demo.route("Frontend Development", "/frontend-development"):
     build_frontend_page()
-print("✓ Frontend Development page built", file=sys.stderr)
-print("Building Test Generation page route...", file=sys.stderr)
 with demo.route("Test Generation", "/test-generation"):
     build_test_generation_page()
-print("✓ Test Generation page built", file=sys.stderr)
-print("Building Information Gathering page route...", file=sys.stderr)
 with demo.route("Information Gathering", "/information-gathering"):
     build_information_gathering_page()
-print("✓ Information Gathering page built", file=sys.stderr)
-print("Building About page route...", file=sys.stderr)
 with demo.route("About", "/about"):
     build_about_page()
-print("✓ About page built", file=sys.stderr)
-# --- Scheduler and Launch
-def restart_space_job():
-    print("Scheduler: Attempting to restart space.")
-    try:
-        api.restart_space(repo_id=LEADERBOARD_PATH)
-        print("Scheduler: Space restart request sent.")
-    except Exception as e:
-        print(f"Scheduler: Error restarting space: {e}")
-# Disabled scheduler for now
-# scheduler = BackgroundScheduler(timezone="UTC")
-# scheduler.add_job(restart_space_job, "interval", hours=1)
-# scheduler.start()
 # Launch the Gradio app
 if __name__ == "__main__":
-    print("=" * 80, file=sys.stderr)
-    print("READY TO LAUNCH GRADIO APP", file=sys.stderr)
-    print("=" * 80, file=sys.stderr)
     if LOCAL_DEBUG:
-        print("Launching in LOCAL_DEBUG mode...", file=sys.stderr)
         demo.launch(debug=True, allowed_paths=["assets"], favicon_path="assets/favicon/favicon.ico")
     else:
-        print("Launching in Space mode...", file=sys.stderr)
-        # For Spaces, share=False is typical unless specific tunneling is needed.
-        # debug=True can be set to False for a "production" Space.
         demo.launch(server_name="0.0.0.0", server_port=7860, debug=True, share=False, allowed_paths=["assets"], favicon_path="assets/favicon/favicon.ico")
-    print("✓ Gradio app launched successfully!", file=sys.stderr)

 import logging
 import sys
+logging.basicConfig(level=logging.INFO, format='%(levelname)s: %(message)s')
+logger = logging.getLogger(__name__)
+logger.info("Starting OpenHands Index application")
 # Setup mock data before anything else
 try:
     from setup_data import setup_mock_data
     setup_mock_data()
+    logger.info("Data setup completed successfully")
 except Exception as e:
+    logger.error(f"Error during data setup: {e}", exc_info=True)
+    logger.warning("Continuing with app startup despite error")
 import gradio as gr
 import urllib.parse
 from huggingface_hub import HfApi
 from config import LEADERBOARD_PATH, LOCAL_DEBUG
 from content import css
 from main_page import build_page as build_main_page
 from bug_fixing import build_page as build_bug_fixing_page
 from app_creation import build_page as build_app_creation_page
 from frontend_development import build_page as build_frontend_page
 from test_generation import build_page as build_test_generation_page
 from information_gathering import build_page as build_information_gathering_page
 from about import build_page as build_about_page
+logger.info(f"All modules imported (LOCAL_DEBUG={LOCAL_DEBUG})")
 api = HfApi()
 LOGO_PATH = "assets/logo.svg"
     encoded_svg = urllib.parse.quote(svg_content)
     home_icon_data_uri = f"data:image/svg+xml,{encoded_svg}"
 except FileNotFoundError:
+    logger.warning(f"Home icon file not found at {LOGO_PATH}")
     home_icon_data_uri = "none"
 # --- This is the final CSS ---
 }}
 """
 # --- Gradio App Definition ---
+logger.info("Creating Gradio application")
 demo = gr.Blocks(
     theme=theme,
     css=final_css,
     head=scroll_script + redirect_script + tooltip_script,
     title="OpenHands Index",
 )
 with demo.route("Home", "/home"):
     build_main_page()
 with demo.route("Bug Fixing", "/bug-fixing"):
     build_bug_fixing_page()
 with demo.route("App Creation", "/app-creation"):
     build_app_creation_page()
 with demo.route("Frontend Development", "/frontend-development"):
     build_frontend_page()
 with demo.route("Test Generation", "/test-generation"):
     build_test_generation_page()
 with demo.route("Information Gathering", "/information-gathering"):
     build_information_gathering_page()
 with demo.route("About", "/about"):
     build_about_page()
+logger.info("All routes configured")
 # Launch the Gradio app
 if __name__ == "__main__":
     if LOCAL_DEBUG:
+        logger.info("Launching in LOCAL_DEBUG mode")
         demo.launch(debug=True, allowed_paths=["assets"], favicon_path="assets/favicon/favicon.ico")
     else:
+        logger.info("Launching in Space mode")
         demo.launch(server_name="0.0.0.0", server_port=7860, debug=True, share=False, allowed_paths=["assets"], favicon_path="assets/favicon/favicon.ico")
+    logger.info("Gradio app launched successfully")

content.py CHANGED Viewed

@@ -16,6 +16,10 @@ def create_gradio_anchor_id(text: str, validation) -> str:
 TITLE = """<h1 align="left" id="space-title">OpenHands Index</h1>"""
 INTRO_PARAGRAPH = """
 <p>
     <strong>OpenHands Index</strong> provides an aggregated view of agent performance and efficiency across all benchmarks in all categories. We report:
 </p>

 TITLE = """<h1 align="left" id="space-title">OpenHands Index</h1>"""
 INTRO_PARAGRAPH = """
+<p>
+    <b>Pre-release:</b> this codebase is not yet released.
+</p>
 <p>
     <strong>OpenHands Index</strong> provides an aggregated view of agent performance and efficiency across all benchmarks in all categories. We report:
 </p>

github_data_loader.py DELETED Viewed

@@ -1,71 +0,0 @@
-"""
-Custom data loader for OpenHands Index that fetches from GitHub instead of HF datasets.
-Mimics the interface of LeaderboardViewer from agent-eval.
-"""
-import pandas as pd
-import requests
-from typing import Dict, List, Tuple
-class GitHubDataLoader:
-    """Loads leaderboard data from GitHub repository."""
-    def __init__(self, base_url: str, split: str):
-        self.base_url = base_url
-        self.split = split
-        self.tag_map = self._build_tag_map()
-    def _build_tag_map(self) -> Dict[str, List[str]]:
-        """Build tag map for the OpenHands datasets."""
-        # Map datasets to their respective tags
-        return {
-            "swe-bench": ["swe-bench"],
-            "multi-swe-bench": ["multi-swe-bench"],
-            "swe-bench-multimodal": ["swe-bench-multimodal"],
-            "swt-bench": ["swt-bench"],
-            "commit0": ["commit0"],
-            "gaia": ["gaia"],
-        }
-    def _load(self) -> Tuple[pd.DataFrame, Dict]:
-        """Load and combine data from all GitHub JSON files."""
-        all_results = []
-        datasets = ["swe-bench", "multi-swe-bench", "swe-bench-multimodal",
-                   "swt-bench", "commit0", "gaia"]
-        for dataset in datasets:
-            url = f"{self.base_url}/{dataset}.json"
-            try:
-                response = requests.get(url, timeout=10)
-                if response.status_code == 200:
-                    data = response.json()
-                    # Transform GitHub data to match agenteval format
-                    for entry in data:
-                        all_results.append({
-                            "agent_name": entry.get("agent_name", "Unknown"),
-                            "score": entry.get("score", 0.0),
-                            "dataset": dataset,
-                            "split": self.split,
-                            # Add other fields as needed
-                        })
-            except Exception as e:
-                print(f"Warning: Could not load data from {url}: {e}")
-                continue
-        if all_results:
-            df = pd.DataFrame(all_results)
-            return df, self.tag_map
-        else:
-            return pd.DataFrame(), self.tag_map
-class DummyViewer:
-    """Fallback viewer when data loading fails."""
-    def __init__(self, df: pd.DataFrame):
-        self._df = df
-        self.tag_map = {"Overall": []}
-    def _load(self) -> Tuple[pd.DataFrame, Dict]:
-        return self._df, self.tag_map

mock_results/1.0.0-dev1/agenteval.json DELETED Viewed

@@ -1,74 +0,0 @@
-{
-  "suite_config": {
-    "name": "openhands-index",
-    "version": "1.0.0-dev1",
-    "splits": [
-      {
-        "name": "swe-bench",
-        "tasks": [
-          {
-            "name": "swe-bench",
-            "tags": [
-              "swe-bench"
-            ]
-          }
-        ]
-      },
-      {
-        "name": "multi-swe-bench",
-        "tasks": [
-          {
-            "name": "multi-swe-bench",
-            "tags": [
-              "multi-swe-bench"
-            ]
-          }
-        ]
-      },
-      {
-        "name": "swe-bench-multimodal",
-        "tasks": [
-          {
-            "name": "swe-bench-multimodal",
-            "tags": [
-              "swe-bench-multimodal"
-            ]
-          }
-        ]
-      },
-      {
-        "name": "swt-bench",
-        "tasks": [
-          {
-            "name": "swt-bench",
-            "tags": [
-              "swt-bench"
-            ]
-          }
-        ]
-      },
-      {
-        "name": "commit0",
-        "tasks": [
-          {
-            "name": "commit0",
-            "tags": [
-              "commit0"
-            ]
-          }
-        ]
-      },
-      {
-        "name": "gaia",
-        "tasks": [
-          {
-            "name": "gaia",
-            "tags": [
-              "gaia"
-            ]
-          }
-        ]
-      }
-    ]
-  }
-}

mock_results/1.0.0-dev1/commit0.jsonl DELETED Viewed

@@ -1,5 +0,0 @@
-{"agent_name": "1.0.2", "llm_base": "claude-3-5-sonnet-20241022", "openness": "closed", "score": 71.2, "metric": "test_pass_rate", "submission_time": "2025-11-25T00:12:22.972910", "tags": ["commit0"], "total_cost": 45.6, "total_runtime": 656.0}
-{"agent_name": "1.0.1", "llm_base": "gpt-4o-2024-11-20", "openness": "closed", "score": 68.9, "metric": "test_pass_rate", "submission_time": "2025-11-25T00:12:22.972929", "tags": ["commit0"], "total_cost": 44.45, "total_runtime": 644.5}
-{"agent_name": "1.0.0", "llm_base": "gpt-4-turbo-2024-04-09", "openness": "closed", "score": 61.5, "metric": "test_pass_rate", "submission_time": "2025-11-25T00:12:22.972939", "tags": ["commit0"], "total_cost": 40.75, "total_runtime": 607.5}
-{"agent_name": "0.9.5", "llm_base": "gpt-4o-mini-2024-07-18", "openness": "closed", "score": 55.3, "metric": "test_pass_rate", "submission_time": "2025-11-25T00:12:22.972947", "tags": ["commit0"], "total_cost": 37.65, "total_runtime": 576.5}
-{"agent_name": "0.9.0", "llm_base": "claude-3-opus-20240229", "openness": "closed", "score": 52.1, "metric": "test_pass_rate", "submission_time": "2025-11-25T00:12:22.972954", "tags": ["commit0"], "total_cost": 36.05, "total_runtime": 560.5}

mock_results/1.0.0-dev1/gaia.jsonl DELETED Viewed

@@ -1,5 +0,0 @@
-{"agent_name": "1.0.2", "llm_base": "claude-3-5-sonnet-20241022", "openness": "closed", "score": 58.7, "metric": "accuracy", "submission_time": "2025-11-25T00:12:22.973093", "tags": ["gaia"], "total_cost": 39.35, "total_runtime": 593.5}
-{"agent_name": "1.0.1", "llm_base": "gpt-4o-2024-11-20", "openness": "closed", "score": 55.2, "metric": "accuracy", "submission_time": "2025-11-25T00:12:22.973111", "tags": ["gaia"], "total_cost": 37.6, "total_runtime": 576.0}
-{"agent_name": "1.0.0", "llm_base": "gpt-4-turbo-2024-04-09", "openness": "closed", "score": 48.3, "metric": "accuracy", "submission_time": "2025-11-25T00:12:22.973121", "tags": ["gaia"], "total_cost": 34.15, "total_runtime": 541.5}
-{"agent_name": "0.9.5", "llm_base": "gpt-4o-mini-2024-07-18", "openness": "closed", "score": 42.1, "metric": "accuracy", "submission_time": "2025-11-25T00:12:22.973129", "tags": ["gaia"], "total_cost": 31.05, "total_runtime": 510.5}
-{"agent_name": "0.9.0", "llm_base": "claude-3-opus-20240229", "openness": "closed", "score": 39.4, "metric": "accuracy", "submission_time": "2025-11-25T00:12:22.973137", "tags": ["gaia"], "total_cost": 29.7, "total_runtime": 497.0}

mock_results/1.0.0-dev1/multi-swe-bench.jsonl DELETED Viewed

@@ -1,5 +0,0 @@
-{"agent_name": "1.0.2", "llm_base": "claude-3-5-sonnet-20241022", "openness": "closed", "score": 35.2, "metric": "resolve_rate", "submission_time": "2025-11-25T00:12:22.972368", "tags": ["multi-swe-bench"], "total_cost": 27.6, "total_runtime": 476.0}
-{"agent_name": "1.0.1", "llm_base": "gpt-4o-2024-11-20", "openness": "closed", "score": 32.8, "metric": "resolve_rate", "submission_time": "2025-11-25T00:12:22.972389", "tags": ["multi-swe-bench"], "total_cost": 26.4, "total_runtime": 464.0}
-{"agent_name": "1.0.0", "llm_base": "gpt-4-turbo-2024-04-09", "openness": "closed", "score": 28.4, "metric": "resolve_rate", "submission_time": "2025-11-25T00:12:22.972400", "tags": ["multi-swe-bench"], "total_cost": 24.2, "total_runtime": 442.0}
-{"agent_name": "0.9.5", "llm_base": "gpt-4o-mini-2024-07-18", "openness": "closed", "score": 24.1, "metric": "resolve_rate", "submission_time": "2025-11-25T00:12:22.972408", "tags": ["multi-swe-bench"], "total_cost": 22.05, "total_runtime": 420.5}
-{"agent_name": "0.9.0", "llm_base": "claude-3-opus-20240229", "openness": "closed", "score": 21.5, "metric": "resolve_rate", "submission_time": "2025-11-25T00:12:22.972416", "tags": ["multi-swe-bench"], "total_cost": 20.75, "total_runtime": 407.5}

mock_results/1.0.0-dev1/results/20251124_claude_3_5_sonnet_20241022/metadata.json ADDED Viewed

	@@ -0,0 +1,9 @@

+{
+  "agent_name": "OpenHands CodeAct v2.1",
+  "agent_version": "OpenHands CodeAct v2.1",
+  "model": "claude-3-5-sonnet-20241022",
+  "openness": "closed_api_available",
+  "tool_usage": "standard",
+  "submission_time": "2025-11-24T19:56:00.092865",
+  "directory_name": "20251124_claude_3_5_sonnet_20241022"
+}

mock_results/1.0.0-dev1/results/20251124_claude_3_5_sonnet_20241022/scores.json ADDED Viewed

	@@ -0,0 +1,62 @@

+[
+  {
+    "benchmark": "swe-bench",
+    "score": 48.3,
+    "metric": "resolve_rate",
+    "total_cost": 34.15,
+    "total_runtime": 541.5,
+    "tags": [
+      "swe-bench"
+    ]
+  },
+  {
+    "benchmark": "swe-bench-multimodal",
+    "score": 42.1,
+    "metric": "resolve_rate",
+    "total_cost": 31.05,
+    "total_runtime": 510.5,
+    "tags": [
+      "swe-bench-multimodal"
+    ]
+  },
+  {
+    "benchmark": "commit0",
+    "score": 71.2,
+    "metric": "test_pass_rate",
+    "total_cost": 45.6,
+    "total_runtime": 656.0,
+    "tags": [
+      "commit0"
+    ]
+  },
+  {
+    "benchmark": "multi-swe-bench",
+    "score": 35.2,
+    "metric": "resolve_rate",
+    "total_cost": 27.6,
+    "total_runtime": 476.0,
+    "tags": [
+      "multi-swe-bench"
+    ]
+  },
+  {
+    "benchmark": "swt-bench",
+    "score": 65.4,
+    "metric": "success_rate",
+    "total_cost": 42.7,
+    "total_runtime": 627.0,
+    "tags": [
+      "swt-bench"
+    ]
+  },
+  {
+    "benchmark": "gaia",
+    "score": 58.7,
+    "metric": "accuracy",
+    "total_cost": 39.35,
+    "total_runtime": 593.5,
+    "tags": [
+      "gaia"
+    ]
+  }
+]

mock_results/1.0.0-dev1/results/20251124_claude_3_opus_20240229/metadata.json ADDED Viewed

	@@ -0,0 +1,9 @@

+{
+  "agent_name": "SWE-Agent",
+  "agent_version": "SWE-Agent",
+  "model": "claude-3-opus-20240229",
+  "openness": "closed_api_available",
+  "tool_usage": "custom_interface",
+  "submission_time": "2025-11-24T19:56:00.092922",
+  "directory_name": "20251124_claude_3_opus_20240229"
+}

mock_results/1.0.0-dev1/results/20251124_claude_3_opus_20240229/scores.json ADDED Viewed

	@@ -0,0 +1,62 @@

+[
+  {
+    "benchmark": "swe-bench",
+    "score": 29.8,
+    "metric": "resolve_rate",
+    "total_cost": 24.9,
+    "total_runtime": 449.0,
+    "tags": [
+      "swe-bench"
+    ]
+  },
+  {
+    "benchmark": "swe-bench-multimodal",
+    "score": 25.7,
+    "metric": "resolve_rate",
+    "total_cost": 22.85,
+    "total_runtime": 428.5,
+    "tags": [
+      "swe-bench-multimodal"
+    ]
+  },
+  {
+    "benchmark": "commit0",
+    "score": 52.1,
+    "metric": "test_pass_rate",
+    "total_cost": 36.05,
+    "total_runtime": 560.5,
+    "tags": [
+      "commit0"
+    ]
+  },
+  {
+    "benchmark": "multi-swe-bench",
+    "score": 21.5,
+    "metric": "resolve_rate",
+    "total_cost": 20.75,
+    "total_runtime": 407.5,
+    "tags": [
+      "multi-swe-bench"
+    ]
+  },
+  {
+    "benchmark": "swt-bench",
+    "score": 44.2,
+    "metric": "success_rate",
+    "total_cost": 32.1,
+    "total_runtime": 521.0,
+    "tags": [
+      "swt-bench"
+    ]
+  },
+  {
+    "benchmark": "gaia",
+    "score": 39.4,
+    "metric": "accuracy",
+    "total_cost": 29.7,
+    "total_runtime": 497.0,
+    "tags": [
+      "gaia"
+    ]
+  }
+]

mock_results/1.0.0-dev1/results/20251124_gpt_4_turbo_2024_04_09/metadata.json ADDED Viewed

	@@ -0,0 +1,9 @@

+{
+  "agent_name": "AutoCodeRover",
+  "agent_version": "AutoCodeRover",
+  "model": "gpt-4-turbo-2024-04-09",
+  "openness": "closed_api_available",
+  "tool_usage": "standard",
+  "submission_time": "2025-11-24T19:56:00.092908",
+  "directory_name": "20251124_gpt_4_turbo_2024_04_09"
+}

mock_results/1.0.0-dev1/results/20251124_gpt_4_turbo_2024_04_09/scores.json ADDED Viewed

	@@ -0,0 +1,62 @@

+[
+  {
+    "benchmark": "swe-bench",
+    "score": 38.7,
+    "metric": "resolve_rate",
+    "total_cost": 29.35,
+    "total_runtime": 493.5,
+    "tags": [
+      "swe-bench"
+    ]
+  },
+  {
+    "benchmark": "swe-bench-multimodal",
+    "score": 34.2,
+    "metric": "resolve_rate",
+    "total_cost": 27.1,
+    "total_runtime": 471.0,
+    "tags": [
+      "swe-bench-multimodal"
+    ]
+  },
+  {
+    "benchmark": "commit0",
+    "score": 61.5,
+    "metric": "test_pass_rate",
+    "total_cost": 40.75,
+    "total_runtime": 607.5,
+    "tags": [
+      "commit0"
+    ]
+  },
+  {
+    "benchmark": "multi-swe-bench",
+    "score": 28.4,
+    "metric": "resolve_rate",
+    "total_cost": 24.2,
+    "total_runtime": 442.0,
+    "tags": [
+      "multi-swe-bench"
+    ]
+  },
+  {
+    "benchmark": "swt-bench",
+    "score": 54.1,
+    "metric": "success_rate",
+    "total_cost": 37.05,
+    "total_runtime": 570.5,
+    "tags": [
+      "swt-bench"
+    ]
+  },
+  {
+    "benchmark": "gaia",
+    "score": 48.3,
+    "metric": "accuracy",
+    "total_cost": 34.15,
+    "total_runtime": 541.5,
+    "tags": [
+      "gaia"
+    ]
+  }
+]

mock_results/1.0.0-dev1/results/20251124_gpt_4o_2024_11_20/metadata.json ADDED Viewed

	@@ -0,0 +1,9 @@

+{
+  "agent_name": "OpenHands CodeAct v2.0",
+  "agent_version": "OpenHands CodeAct v2.0",
+  "model": "gpt-4o-2024-11-20",
+  "openness": "closed_api_available",
+  "tool_usage": "standard",
+  "submission_time": "2025-11-24T19:56:00.092895",
+  "directory_name": "20251124_gpt_4o_2024_11_20"
+}

mock_results/1.0.0-dev1/results/20251124_gpt_4o_2024_11_20/scores.json ADDED Viewed

	@@ -0,0 +1,62 @@

+[
+  {
+    "benchmark": "swe-bench",
+    "score": 45.1,
+    "metric": "resolve_rate",
+    "total_cost": 32.55,
+    "total_runtime": 525.5,
+    "tags": [
+      "swe-bench"
+    ]
+  },
+  {
+    "benchmark": "swe-bench-multimodal",
+    "score": 39.5,
+    "metric": "resolve_rate",
+    "total_cost": 29.75,
+    "total_runtime": 497.5,
+    "tags": [
+      "swe-bench-multimodal"
+    ]
+  },
+  {
+    "benchmark": "commit0",
+    "score": 68.9,
+    "metric": "test_pass_rate",
+    "total_cost": 44.45,
+    "total_runtime": 644.5,
+    "tags": [
+      "commit0"
+    ]
+  },
+  {
+    "benchmark": "multi-swe-bench",
+    "score": 32.8,
+    "metric": "resolve_rate",
+    "total_cost": 26.4,
+    "total_runtime": 464.0,
+    "tags": [
+      "multi-swe-bench"
+    ]
+  },
+  {
+    "benchmark": "swt-bench",
+    "score": 62.3,
+    "metric": "success_rate",
+    "total_cost": 41.15,
+    "total_runtime": 611.5,
+    "tags": [
+      "swt-bench"
+    ]
+  },
+  {
+    "benchmark": "gaia",
+    "score": 55.2,
+    "metric": "accuracy",
+    "total_cost": 37.6,
+    "total_runtime": 576.0,
+    "tags": [
+      "gaia"
+    ]
+  }
+]

mock_results/1.0.0-dev1/results/20251124_gpt_4o_mini_2024_07_18/metadata.json ADDED Viewed

	@@ -0,0 +1,9 @@

+{
+  "agent_name": "Agentless",
+  "agent_version": "Agentless",
+  "model": "gpt-4o-mini-2024-07-18",
+  "openness": "closed_api_available",
+  "tool_usage": "standard",
+  "submission_time": "2025-11-24T19:56:00.092916",
+  "directory_name": "20251124_gpt_4o_mini_2024_07_18"
+}

mock_results/1.0.0-dev1/results/20251124_gpt_4o_mini_2024_07_18/scores.json ADDED Viewed

	@@ -0,0 +1,62 @@

+[
+  {
+    "benchmark": "swe-bench",
+    "score": 32.5,
+    "metric": "resolve_rate",
+    "total_cost": 26.25,
+    "total_runtime": 462.5,
+    "tags": [
+      "swe-bench"
+    ]
+  },
+  {
+    "benchmark": "swe-bench-multimodal",
+    "score": 28.9,
+    "metric": "resolve_rate",
+    "total_cost": 24.45,
+    "total_runtime": 444.5,
+    "tags": [
+      "swe-bench-multimodal"
+    ]
+  },
+  {
+    "benchmark": "commit0",
+    "score": 55.3,
+    "metric": "test_pass_rate",
+    "total_cost": 37.65,
+    "total_runtime": 576.5,
+    "tags": [
+      "commit0"
+    ]
+  },
+  {
+    "benchmark": "multi-swe-bench",
+    "score": 24.1,
+    "metric": "resolve_rate",
+    "total_cost": 22.05,
+    "total_runtime": 420.5,
+    "tags": [
+      "multi-swe-bench"
+    ]
+  },
+  {
+    "benchmark": "swt-bench",
+    "score": 47.8,
+    "metric": "success_rate",
+    "total_cost": 33.9,
+    "total_runtime": 539.0,
+    "tags": [
+      "swt-bench"
+    ]
+  },
+  {
+    "benchmark": "gaia",
+    "score": 42.1,
+    "metric": "accuracy",
+    "total_cost": 31.05,
+    "total_runtime": 510.5,
+    "tags": [
+      "gaia"
+    ]
+  }
+]

mock_results/1.0.0-dev1/swe-bench-multimodal.jsonl DELETED Viewed

@@ -1,5 +0,0 @@
-{"agent_name": "1.0.2", "llm_base": "claude-3-5-sonnet-20241022", "openness": "closed", "score": 42.1, "metric": "resolve_rate", "submission_time": "2025-11-25T00:12:22.972550", "tags": ["swe-bench-multimodal"], "total_cost": 31.05, "total_runtime": 510.5}
-{"agent_name": "1.0.1", "llm_base": "gpt-4o-2024-11-20", "openness": "closed", "score": 39.5, "metric": "resolve_rate", "submission_time": "2025-11-25T00:12:22.972567", "tags": ["swe-bench-multimodal"], "total_cost": 29.75, "total_runtime": 497.5}
-{"agent_name": "1.0.0", "llm_base": "gpt-4-turbo-2024-04-09", "openness": "closed", "score": 34.2, "metric": "resolve_rate", "submission_time": "2025-11-25T00:12:22.972577", "tags": ["swe-bench-multimodal"], "total_cost": 27.1, "total_runtime": 471.0}
-{"agent_name": "0.9.5", "llm_base": "gpt-4o-mini-2024-07-18", "openness": "closed", "score": 28.9, "metric": "resolve_rate", "submission_time": "2025-11-25T00:12:22.972585", "tags": ["swe-bench-multimodal"], "total_cost": 24.45, "total_runtime": 444.5}
-{"agent_name": "0.9.0", "llm_base": "claude-3-opus-20240229", "openness": "closed", "score": 25.7, "metric": "resolve_rate", "submission_time": "2025-11-25T00:12:22.972593", "tags": ["swe-bench-multimodal"], "total_cost": 22.85, "total_runtime": 428.5}

mock_results/1.0.0-dev1/swe-bench.jsonl DELETED Viewed

@@ -1,5 +0,0 @@
-{"agent_name": "1.0.2", "llm_base": "claude-3-5-sonnet-20241022", "openness": "closed", "score": 48.3, "metric": "resolve_rate", "submission_time": "2025-11-25T00:12:22.972101", "tags": ["swe-bench"], "total_cost": 34.15, "total_runtime": 541.5}
-{"agent_name": "1.0.1", "llm_base": "gpt-4o-2024-11-20", "openness": "closed", "score": 45.1, "metric": "resolve_rate", "submission_time": "2025-11-25T00:12:22.972136", "tags": ["swe-bench"], "total_cost": 32.55, "total_runtime": 525.5}
-{"agent_name": "1.0.0", "llm_base": "gpt-4-turbo-2024-04-09", "openness": "closed", "score": 38.7, "metric": "resolve_rate", "submission_time": "2025-11-25T00:12:22.972167", "tags": ["swe-bench"], "total_cost": 29.35, "total_runtime": 493.5}
-{"agent_name": "0.9.5", "llm_base": "gpt-4o-mini-2024-07-18", "openness": "closed", "score": 32.5, "metric": "resolve_rate", "submission_time": "2025-11-25T00:12:22.972178", "tags": ["swe-bench"], "total_cost": 26.25, "total_runtime": 462.5}
-{"agent_name": "0.9.0", "llm_base": "claude-3-opus-20240229", "openness": "closed", "score": 29.8, "metric": "resolve_rate", "submission_time": "2025-11-25T00:12:22.972186", "tags": ["swe-bench"], "total_cost": 24.9, "total_runtime": 449.0}

mock_results/1.0.0-dev1/swt-bench.jsonl DELETED Viewed

@@ -1,5 +0,0 @@
-{"agent_name": "1.0.2", "llm_base": "claude-3-5-sonnet-20241022", "openness": "closed", "score": 65.4, "metric": "success_rate", "submission_time": "2025-11-25T00:12:22.972724", "tags": ["swt-bench"], "total_cost": 42.7, "total_runtime": 627.0}
-{"agent_name": "1.0.1", "llm_base": "gpt-4o-2024-11-20", "openness": "closed", "score": 62.3, "metric": "success_rate", "submission_time": "2025-11-25T00:12:22.972741", "tags": ["swt-bench"], "total_cost": 41.15, "total_runtime": 611.5}
-{"agent_name": "1.0.0", "llm_base": "gpt-4-turbo-2024-04-09", "openness": "closed", "score": 54.1, "metric": "success_rate", "submission_time": "2025-11-25T00:12:22.972750", "tags": ["swt-bench"], "total_cost": 37.05, "total_runtime": 570.5}
-{"agent_name": "0.9.5", "llm_base": "gpt-4o-mini-2024-07-18", "openness": "closed", "score": 47.8, "metric": "success_rate", "submission_time": "2025-11-25T00:12:22.972758", "tags": ["swt-bench"], "total_cost": 33.9, "total_runtime": 539.0}
-{"agent_name": "0.9.0", "llm_base": "claude-3-opus-20240229", "openness": "closed", "score": 44.2, "metric": "success_rate", "submission_time": "2025-11-25T00:12:22.972765", "tags": ["swt-bench"], "total_cost": 32.1, "total_runtime": 521.0}

mock_results/1.0.0-dev1/test.jsonl DELETED Viewed

@@ -1,30 +0,0 @@
-{"agent_name": "OpenHands CodeAct v2.1", "llm_base": "claude-3-5-sonnet-20241022", "openness": "closed_api_available", "tool_usage": "standard", "score": 48.3, "metric": "resolve_rate", "submission_time": "2025-11-24T19:56:00.092865", "tags": ["swe-bench"], "total_cost": 34.15, "total_runtime": 541.5}
-{"agent_name": "OpenHands CodeAct v2.0", "llm_base": "gpt-4o-2024-11-20", "openness": "closed_api_available", "tool_usage": "standard", "score": 45.1, "metric": "resolve_rate", "submission_time": "2025-11-24T19:56:00.092895", "tags": ["swe-bench"], "total_cost": 32.55, "total_runtime": 525.5}
-{"agent_name": "AutoCodeRover", "llm_base": "gpt-4-turbo-2024-04-09", "openness": "closed_api_available", "tool_usage": "standard", "score": 38.7, "metric": "resolve_rate", "submission_time": "2025-11-24T19:56:00.092908", "tags": ["swe-bench"], "total_cost": 29.35, "total_runtime": 493.5}
-{"agent_name": "Agentless", "llm_base": "gpt-4o-mini-2024-07-18", "openness": "closed_api_available", "tool_usage": "standard", "score": 32.5, "metric": "resolve_rate", "submission_time": "2025-11-24T19:56:00.092916", "tags": ["swe-bench"], "total_cost": 26.25, "total_runtime": 462.5}
-{"agent_name": "SWE-Agent", "llm_base": "claude-3-opus-20240229", "openness": "closed_api_available", "tool_usage": "custom_interface", "score": 29.8, "metric": "resolve_rate", "submission_time": "2025-11-24T19:56:00.092922", "tags": ["swe-bench"], "total_cost": 24.9, "total_runtime": 449.0}
-{"agent_name": "OpenHands CodeAct v2.1", "llm_base": "claude-3-5-sonnet-20241022", "openness": "closed_api_available", "tool_usage": "standard", "score": 35.2, "metric": "resolve_rate", "submission_time": "2025-11-24T19:56:00.093026", "tags": ["multi-swe-bench"], "total_cost": 27.6, "total_runtime": 476.0}
-{"agent_name": "OpenHands CodeAct v2.0", "llm_base": "gpt-4o-2024-11-20", "openness": "closed_api_available", "tool_usage": "standard", "score": 32.8, "metric": "resolve_rate", "submission_time": "2025-11-24T19:56:00.093040", "tags": ["multi-swe-bench"], "total_cost": 26.4, "total_runtime": 464.0}
-{"agent_name": "AutoCodeRover", "llm_base": "gpt-4-turbo-2024-04-09", "openness": "closed_api_available", "tool_usage": "standard", "score": 28.4, "metric": "resolve_rate", "submission_time": "2025-11-24T19:56:00.093048", "tags": ["multi-swe-bench"], "total_cost": 24.2, "total_runtime": 442.0}
-{"agent_name": "Agentless", "llm_base": "gpt-4o-mini-2024-07-18", "openness": "closed_api_available", "tool_usage": "standard", "score": 24.1, "metric": "resolve_rate", "submission_time": "2025-11-24T19:56:00.093058", "tags": ["multi-swe-bench"], "total_cost": 22.05, "total_runtime": 420.5}
-{"agent_name": "SWE-Agent", "llm_base": "claude-3-opus-20240229", "openness": "closed_api_available", "tool_usage": "custom_interface", "score": 21.5, "metric": "resolve_rate", "submission_time": "2025-11-24T19:56:00.093067", "tags": ["multi-swe-bench"], "total_cost": 20.75, "total_runtime": 407.5}
-{"agent_name": "OpenHands CodeAct v2.1", "llm_base": "claude-3-5-sonnet-20241022", "openness": "closed_api_available", "tool_usage": "standard", "score": 42.1, "metric": "resolve_rate", "submission_time": "2025-11-24T19:56:00.093172", "tags": ["swe-bench-multimodal"], "total_cost": 31.05, "total_runtime": 510.5}
-{"agent_name": "OpenHands CodeAct v2.0", "llm_base": "gpt-4o-2024-11-20", "openness": "closed_api_available", "tool_usage": "standard", "score": 39.5, "metric": "resolve_rate", "submission_time": "2025-11-24T19:56:00.093187", "tags": ["swe-bench-multimodal"], "total_cost": 29.75, "total_runtime": 497.5}
-{"agent_name": "AutoCodeRover", "llm_base": "gpt-4-turbo-2024-04-09", "openness": "closed_api_available", "tool_usage": "standard", "score": 34.2, "metric": "resolve_rate", "submission_time": "2025-11-24T19:56:00.093200", "tags": ["swe-bench-multimodal"], "total_cost": 27.1, "total_runtime": 471.0}
-{"agent_name": "Agentless", "llm_base": "gpt-4o-mini-2024-07-18", "openness": "closed_api_available", "tool_usage": "standard", "score": 28.9, "metric": "resolve_rate", "submission_time": "2025-11-24T19:56:00.093208", "tags": ["swe-bench-multimodal"], "total_cost": 24.45, "total_runtime": 444.5}
-{"agent_name": "SWE-Agent", "llm_base": "claude-3-opus-20240229", "openness": "closed_api_available", "tool_usage": "custom_interface", "score": 25.7, "metric": "resolve_rate", "submission_time": "2025-11-24T19:56:00.093218", "tags": ["swe-bench-multimodal"], "total_cost": 22.85, "total_runtime": 428.5}
-{"agent_name": "OpenHands CodeAct v2.1", "llm_base": "claude-3-5-sonnet-20241022", "openness": "closed_api_available", "tool_usage": "standard", "score": 65.4, "metric": "success_rate", "submission_time": "2025-11-24T19:56:00.093304", "tags": ["swt-bench"], "total_cost": 42.7, "total_runtime": 627.0}
-{"agent_name": "OpenHands CodeAct v2.0", "llm_base": "gpt-4o-2024-11-20", "openness": "closed_api_available", "tool_usage": "standard", "score": 62.3, "metric": "success_rate", "submission_time": "2025-11-24T19:56:00.093317", "tags": ["swt-bench"], "total_cost": 41.15, "total_runtime": 611.5}
-{"agent_name": "AutoCodeRover", "llm_base": "gpt-4-turbo-2024-04-09", "openness": "closed_api_available", "tool_usage": "standard", "score": 54.1, "metric": "success_rate", "submission_time": "2025-11-24T19:56:00.093326", "tags": ["swt-bench"], "total_cost": 37.05, "total_runtime": 570.5}
-{"agent_name": "Agentless", "llm_base": "gpt-4o-mini-2024-07-18", "openness": "closed_api_available", "tool_usage": "standard", "score": 47.8, "metric": "success_rate", "submission_time": "2025-11-24T19:56:00.093333", "tags": ["swt-bench"], "total_cost": 33.9, "total_runtime": 539.0}
-{"agent_name": "SWE-Agent", "llm_base": "claude-3-opus-20240229", "openness": "closed_api_available", "tool_usage": "custom_interface", "score": 44.2, "metric": "success_rate", "submission_time": "2025-11-24T19:56:00.093338", "tags": ["swt-bench"], "total_cost": 32.1, "total_runtime": 521.0}
-{"agent_name": "OpenHands CodeAct v2.1", "llm_base": "claude-3-5-sonnet-20241022", "openness": "closed_api_available", "tool_usage": "standard", "score": 71.2, "metric": "test_pass_rate", "submission_time": "2025-11-24T19:56:00.093416", "tags": ["commit0"], "total_cost": 45.6, "total_runtime": 656.0}
-{"agent_name": "OpenHands CodeAct v2.0", "llm_base": "gpt-4o-2024-11-20", "openness": "closed_api_available", "tool_usage": "standard", "score": 68.9, "metric": "test_pass_rate", "submission_time": "2025-11-24T19:56:00.093428", "tags": ["commit0"], "total_cost": 44.45, "total_runtime": 644.5}
-{"agent_name": "AutoCodeRover", "llm_base": "gpt-4-turbo-2024-04-09", "openness": "closed_api_available", "tool_usage": "standard", "score": 61.5, "metric": "test_pass_rate", "submission_time": "2025-11-24T19:56:00.093436", "tags": ["commit0"], "total_cost": 40.75, "total_runtime": 607.5}
-{"agent_name": "Agentless", "llm_base": "gpt-4o-mini-2024-07-18", "openness": "closed_api_available", "tool_usage": "standard", "score": 55.3, "metric": "test_pass_rate", "submission_time": "2025-11-24T19:56:00.093445", "tags": ["commit0"], "total_cost": 37.65, "total_runtime": 576.5}
-{"agent_name": "SWE-Agent", "llm_base": "claude-3-opus-20240229", "openness": "closed_api_available", "tool_usage": "custom_interface", "score": 52.1, "metric": "test_pass_rate", "submission_time": "2025-11-24T19:56:00.093450", "tags": ["commit0"], "total_cost": 36.05, "total_runtime": 560.5}
-{"agent_name": "OpenHands CodeAct v2.1", "llm_base": "claude-3-5-sonnet-20241022", "openness": "closed_api_available", "tool_usage": "standard", "score": 58.7, "metric": "accuracy", "submission_time": "2025-11-24T19:56:00.093541", "tags": ["gaia"], "total_cost": 39.35, "total_runtime": 593.5}
-{"agent_name": "OpenHands CodeAct v2.0", "llm_base": "gpt-4o-2024-11-20", "openness": "closed_api_available", "tool_usage": "standard", "score": 55.2, "metric": "accuracy", "submission_time": "2025-11-24T19:56:00.093556", "tags": ["gaia"], "total_cost": 37.6, "total_runtime": 576.0}
-{"agent_name": "AutoCodeRover", "llm_base": "gpt-4-turbo-2024-04-09", "openness": "closed_api_available", "tool_usage": "standard", "score": 48.3, "metric": "accuracy", "submission_time": "2025-11-24T19:56:00.093563", "tags": ["gaia"], "total_cost": 34.15, "total_runtime": 541.5}
-{"agent_name": "Agentless", "llm_base": "gpt-4o-mini-2024-07-18", "openness": "closed_api_available", "tool_usage": "standard", "score": 42.1, "metric": "accuracy", "submission_time": "2025-11-24T19:56:00.093568", "tags": ["gaia"], "total_cost": 31.05, "total_runtime": 510.5}
-{"agent_name": "SWE-Agent", "llm_base": "claude-3-opus-20240229", "openness": "closed_api_available", "tool_usage": "custom_interface", "score": 39.4, "metric": "accuracy", "submission_time": "2025-11-24T19:56:00.093574", "tags": ["gaia"], "total_cost": 29.7, "total_runtime": 497.0}

mock_results/1.0.0-dev1/test.parquet DELETED Viewed

Binary file (9.26 kB)

mock_results/1.0.0-dev1/validation.jsonl DELETED Viewed

@@ -1,30 +0,0 @@
-{"agent_name": "OpenHands CodeAct v2.1", "llm_base": "claude-3-5-sonnet-20241022", "openness": "closed_api_available", "tool_usage": "standard", "score": 48.3, "metric": "resolve_rate", "submission_time": "2025-11-24T19:56:00.092865", "tags": ["swe-bench"], "total_cost": 34.15, "total_runtime": 541.5}
-{"agent_name": "OpenHands CodeAct v2.0", "llm_base": "gpt-4o-2024-11-20", "openness": "closed_api_available", "tool_usage": "standard", "score": 45.1, "metric": "resolve_rate", "submission_time": "2025-11-24T19:56:00.092895", "tags": ["swe-bench"], "total_cost": 32.55, "total_runtime": 525.5}
-{"agent_name": "AutoCodeRover", "llm_base": "gpt-4-turbo-2024-04-09", "openness": "closed_api_available", "tool_usage": "standard", "score": 38.7, "metric": "resolve_rate", "submission_time": "2025-11-24T19:56:00.092908", "tags": ["swe-bench"], "total_cost": 29.35, "total_runtime": 493.5}
-{"agent_name": "Agentless", "llm_base": "gpt-4o-mini-2024-07-18", "openness": "closed_api_available", "tool_usage": "standard", "score": 32.5, "metric": "resolve_rate", "submission_time": "2025-11-24T19:56:00.092916", "tags": ["swe-bench"], "total_cost": 26.25, "total_runtime": 462.5}
-{"agent_name": "SWE-Agent", "llm_base": "claude-3-opus-20240229", "openness": "closed_api_available", "tool_usage": "custom_interface", "score": 29.8, "metric": "resolve_rate", "submission_time": "2025-11-24T19:56:00.092922", "tags": ["swe-bench"], "total_cost": 24.9, "total_runtime": 449.0}
-{"agent_name": "OpenHands CodeAct v2.1", "llm_base": "claude-3-5-sonnet-20241022", "openness": "closed_api_available", "tool_usage": "standard", "score": 35.2, "metric": "resolve_rate", "submission_time": "2025-11-24T19:56:00.093026", "tags": ["multi-swe-bench"], "total_cost": 27.6, "total_runtime": 476.0}
-{"agent_name": "OpenHands CodeAct v2.0", "llm_base": "gpt-4o-2024-11-20", "openness": "closed_api_available", "tool_usage": "standard", "score": 32.8, "metric": "resolve_rate", "submission_time": "2025-11-24T19:56:00.093040", "tags": ["multi-swe-bench"], "total_cost": 26.4, "total_runtime": 464.0}
-{"agent_name": "AutoCodeRover", "llm_base": "gpt-4-turbo-2024-04-09", "openness": "closed_api_available", "tool_usage": "standard", "score": 28.4, "metric": "resolve_rate", "submission_time": "2025-11-24T19:56:00.093048", "tags": ["multi-swe-bench"], "total_cost": 24.2, "total_runtime": 442.0}
-{"agent_name": "Agentless", "llm_base": "gpt-4o-mini-2024-07-18", "openness": "closed_api_available", "tool_usage": "standard", "score": 24.1, "metric": "resolve_rate", "submission_time": "2025-11-24T19:56:00.093058", "tags": ["multi-swe-bench"], "total_cost": 22.05, "total_runtime": 420.5}
-{"agent_name": "SWE-Agent", "llm_base": "claude-3-opus-20240229", "openness": "closed_api_available", "tool_usage": "custom_interface", "score": 21.5, "metric": "resolve_rate", "submission_time": "2025-11-24T19:56:00.093067", "tags": ["multi-swe-bench"], "total_cost": 20.75, "total_runtime": 407.5}
-{"agent_name": "OpenHands CodeAct v2.1", "llm_base": "claude-3-5-sonnet-20241022", "openness": "closed_api_available", "tool_usage": "standard", "score": 42.1, "metric": "resolve_rate", "submission_time": "2025-11-24T19:56:00.093172", "tags": ["swe-bench-multimodal"], "total_cost": 31.05, "total_runtime": 510.5}
-{"agent_name": "OpenHands CodeAct v2.0", "llm_base": "gpt-4o-2024-11-20", "openness": "closed_api_available", "tool_usage": "standard", "score": 39.5, "metric": "resolve_rate", "submission_time": "2025-11-24T19:56:00.093187", "tags": ["swe-bench-multimodal"], "total_cost": 29.75, "total_runtime": 497.5}
-{"agent_name": "AutoCodeRover", "llm_base": "gpt-4-turbo-2024-04-09", "openness": "closed_api_available", "tool_usage": "standard", "score": 34.2, "metric": "resolve_rate", "submission_time": "2025-11-24T19:56:00.093200", "tags": ["swe-bench-multimodal"], "total_cost": 27.1, "total_runtime": 471.0}
-{"agent_name": "Agentless", "llm_base": "gpt-4o-mini-2024-07-18", "openness": "closed_api_available", "tool_usage": "standard", "score": 28.9, "metric": "resolve_rate", "submission_time": "2025-11-24T19:56:00.093208", "tags": ["swe-bench-multimodal"], "total_cost": 24.45, "total_runtime": 444.5}
-{"agent_name": "SWE-Agent", "llm_base": "claude-3-opus-20240229", "openness": "closed_api_available", "tool_usage": "custom_interface", "score": 25.7, "metric": "resolve_rate", "submission_time": "2025-11-24T19:56:00.093218", "tags": ["swe-bench-multimodal"], "total_cost": 22.85, "total_runtime": 428.5}
-{"agent_name": "OpenHands CodeAct v2.1", "llm_base": "claude-3-5-sonnet-20241022", "openness": "closed_api_available", "tool_usage": "standard", "score": 65.4, "metric": "success_rate", "submission_time": "2025-11-24T19:56:00.093304", "tags": ["swt-bench"], "total_cost": 42.7, "total_runtime": 627.0}
-{"agent_name": "OpenHands CodeAct v2.0", "llm_base": "gpt-4o-2024-11-20", "openness": "closed_api_available", "tool_usage": "standard", "score": 62.3, "metric": "success_rate", "submission_time": "2025-11-24T19:56:00.093317", "tags": ["swt-bench"], "total_cost": 41.15, "total_runtime": 611.5}
-{"agent_name": "AutoCodeRover", "llm_base": "gpt-4-turbo-2024-04-09", "openness": "closed_api_available", "tool_usage": "standard", "score": 54.1, "metric": "success_rate", "submission_time": "2025-11-24T19:56:00.093326", "tags": ["swt-bench"], "total_cost": 37.05, "total_runtime": 570.5}
-{"agent_name": "Agentless", "llm_base": "gpt-4o-mini-2024-07-18", "openness": "closed_api_available", "tool_usage": "standard", "score": 47.8, "metric": "success_rate", "submission_time": "2025-11-24T19:56:00.093333", "tags": ["swt-bench"], "total_cost": 33.9, "total_runtime": 539.0}
-{"agent_name": "SWE-Agent", "llm_base": "claude-3-opus-20240229", "openness": "closed_api_available", "tool_usage": "custom_interface", "score": 44.2, "metric": "success_rate", "submission_time": "2025-11-24T19:56:00.093338", "tags": ["swt-bench"], "total_cost": 32.1, "total_runtime": 521.0}
-{"agent_name": "OpenHands CodeAct v2.1", "llm_base": "claude-3-5-sonnet-20241022", "openness": "closed_api_available", "tool_usage": "standard", "score": 71.2, "metric": "test_pass_rate", "submission_time": "2025-11-24T19:56:00.093416", "tags": ["commit0"], "total_cost": 45.6, "total_runtime": 656.0}
-{"agent_name": "OpenHands CodeAct v2.0", "llm_base": "gpt-4o-2024-11-20", "openness": "closed_api_available", "tool_usage": "standard", "score": 68.9, "metric": "test_pass_rate", "submission_time": "2025-11-24T19:56:00.093428", "tags": ["commit0"], "total_cost": 44.45, "total_runtime": 644.5}
-{"agent_name": "AutoCodeRover", "llm_base": "gpt-4-turbo-2024-04-09", "openness": "closed_api_available", "tool_usage": "standard", "score": 61.5, "metric": "test_pass_rate", "submission_time": "2025-11-24T19:56:00.093436", "tags": ["commit0"], "total_cost": 40.75, "total_runtime": 607.5}
-{"agent_name": "Agentless", "llm_base": "gpt-4o-mini-2024-07-18", "openness": "closed_api_available", "tool_usage": "standard", "score": 55.3, "metric": "test_pass_rate", "submission_time": "2025-11-24T19:56:00.093445", "tags": ["commit0"], "total_cost": 37.65, "total_runtime": 576.5}
-{"agent_name": "SWE-Agent", "llm_base": "claude-3-opus-20240229", "openness": "closed_api_available", "tool_usage": "custom_interface", "score": 52.1, "metric": "test_pass_rate", "submission_time": "2025-11-24T19:56:00.093450", "tags": ["commit0"], "total_cost": 36.05, "total_runtime": 560.5}
-{"agent_name": "OpenHands CodeAct v2.1", "llm_base": "claude-3-5-sonnet-20241022", "openness": "closed_api_available", "tool_usage": "standard", "score": 58.7, "metric": "accuracy", "submission_time": "2025-11-24T19:56:00.093541", "tags": ["gaia"], "total_cost": 39.35, "total_runtime": 593.5}
-{"agent_name": "OpenHands CodeAct v2.0", "llm_base": "gpt-4o-2024-11-20", "openness": "closed_api_available", "tool_usage": "standard", "score": 55.2, "metric": "accuracy", "submission_time": "2025-11-24T19:56:00.093556", "tags": ["gaia"], "total_cost": 37.6, "total_runtime": 576.0}
-{"agent_name": "AutoCodeRover", "llm_base": "gpt-4-turbo-2024-04-09", "openness": "closed_api_available", "tool_usage": "standard", "score": 48.3, "metric": "accuracy", "submission_time": "2025-11-24T19:56:00.093563", "tags": ["gaia"], "total_cost": 34.15, "total_runtime": 541.5}
-{"agent_name": "Agentless", "llm_base": "gpt-4o-mini-2024-07-18", "openness": "closed_api_available", "tool_usage": "standard", "score": 42.1, "metric": "accuracy", "submission_time": "2025-11-24T19:56:00.093568", "tags": ["gaia"], "total_cost": 31.05, "total_runtime": 510.5}
-{"agent_name": "SWE-Agent", "llm_base": "claude-3-opus-20240229", "openness": "closed_api_available", "tool_usage": "custom_interface", "score": 39.4, "metric": "accuracy", "submission_time": "2025-11-24T19:56:00.093574", "tags": ["gaia"], "total_cost": 29.7, "total_runtime": 497.0}

mock_results/1.0.0-dev1/validation.parquet DELETED Viewed

Binary file (9.29 kB)

requirements.txt CHANGED Viewed

@@ -4,7 +4,6 @@ pandas==2.2.3
 plotly==6.0.1
 requests==2.32.3
 huggingface-hub==0.30.2
-APScheduler==3.11.0
 # Additional dependencies for UI and processing
 matplotlib==3.10.3

 plotly==6.0.1
 requests==2.32.3
 huggingface-hub==0.30.2
 # Additional dependencies for UI and processing
 matplotlib==3.10.3

setup_data.py CHANGED Viewed

@@ -44,33 +44,29 @@ def fetch_data_from_github():
             return False
         # Look for data files in the cloned repository
-        # Expected structure: openhands-index-results/{version}/test.jsonl, validation.jsonl, etc.
-        data_source = temp_clone_dir / CONFIG_NAME
-        if not data_source.exists():
-            print(f"Data directory {data_source} not found in repository")
-            # Try to find any version directories
-            version_dirs = list(temp_clone_dir.glob("*.*.*"))
-            if version_dirs:
-                print(f"Found version directories: {[d.name for d in version_dirs]}")
-                # Use the first available version
-                data_source = version_dirs[0]
-                print(f"Using data from {data_source.name}")
-            else:
-                print("No data found in repository")
-                return False
-        # Check if there are any JSONL files
-        jsonl_files = list(data_source.glob("*.jsonl"))
-        if not jsonl_files:
-            print(f"No JSONL files found in {data_source}")
             return False
-        # Create target directory and copy data
         os.makedirs(target_dir.parent, exist_ok=True)
         if target_dir.exists():
             shutil.rmtree(target_dir)
-        shutil.copytree(data_source, target_dir)
         print(f"Successfully fetched data from GitHub. Files: {list(target_dir.glob('*'))}")
         return True

             return False
         # Look for data files in the cloned repository
+        # Expected structure: openhands-index-results/results/YYYYMMDD_model/
+        results_source = temp_clone_dir / "results"
+        if not results_source.exists():
+            print(f"Results directory not found in repository")
+            return False
+        # Check if there are any agent result directories
+        result_dirs = list(results_source.iterdir())
+        if not result_dirs:
+            print(f"No agent results found in {results_source}")
             return False
+        print(f"Found {len(result_dirs)} agent result directories")
+        # Create target directory and copy the results structure
         os.makedirs(target_dir.parent, exist_ok=True)
         if target_dir.exists():
             shutil.rmtree(target_dir)
+        # Copy the entire results directory
+        target_results = target_dir / "results"
+        shutil.copytree(results_source, target_results)
         print(f"Successfully fetched data from GitHub. Files: {list(target_dir.glob('*'))}")
         return True

simple_data_loader.py CHANGED Viewed

@@ -103,43 +103,16 @@ class SimpleLeaderboardViewer:
         return pd.DataFrame(all_records)
     def _load(self):
-        """Load the JSONL file for the split and return DataFrame and tag map."""
-        # Try new format first (agent-centric directories)
         df = self._load_from_agent_dirs()
         if df is None:
-            # Fall back to old format (benchmark-centric JSONL)
-            jsonl_file = self.config_path / f"{self.split}.jsonl"
-            if not jsonl_file.exists():
-                # Return empty dataframe with error message
-                return pd.DataFrame({
-                    "Message": [f"No data found for split '{self.split}'. Expected file: {jsonl_file}"]
-                }), {}
-            try:
-                # Read JSONL file
-                records = []
-                with open(jsonl_file, 'r') as f:
-                    for line in f:
-                        if line.strip():
-                            records.append(json.loads(line))
-                if not records:
-                    return pd.DataFrame({
-                        "Message": [f"No data in file: {jsonl_file}"]
-                    }), {}
-                # Convert to DataFrame
-                df = pd.DataFrame(records)
-            except Exception as e:
-                import traceback
-                traceback.print_exc()
-                return pd.DataFrame({
-                    "Message": [f"Error loading data: {e}"]
-                }), {}
-        # Now process the dataframe (works for both old and new format)
         try:
             # Transform to expected format for leaderboard

         return pd.DataFrame(all_records)
     def _load(self):
+        """Load data from agent-centric directories and return DataFrame and tag map."""
         df = self._load_from_agent_dirs()
         if df is None:
+            # Return empty dataframe with error message
+            return pd.DataFrame({
+                "Message": [f"No data found for split '{self.split}' in results directory"]
+            }), {}
+        # Process the dataframe
         try:
             # Transform to expected format for leaderboard