Spaces:
Running
Running
openhands
openhands
committed on
Commit
·
855423e
1
Parent(s):
64c8899
Cleanup codebase: remove unused code, simplify data loading, and add pre-release notice
Browse files
- Remove unused github_data_loader.py (71 lines of dead code)
- Clean up debug logging in app.py (replaced ~40 print statements with proper logger)
- Remove scheduler code and APScheduler dependency from requirements.txt
- Simplify simple_data_loader.py by removing old JSONL format fallback
- Update setup_data.py to fetch data from results/ directory only
- Update mock data to use new agent-centric directory structure
- Add pre-release notice to main page intro paragraph
Co-authored-by: openhands <[email protected]>
- app.py +14 -68
- content.py +4 -0
- github_data_loader.py +0 -71
- mock_results/1.0.0-dev1/agenteval.json +0 -74
- mock_results/1.0.0-dev1/commit0.jsonl +0 -5
- mock_results/1.0.0-dev1/gaia.jsonl +0 -5
- mock_results/1.0.0-dev1/multi-swe-bench.jsonl +0 -5
- mock_results/1.0.0-dev1/results/20251124_claude_3_5_sonnet_20241022/metadata.json +9 -0
- mock_results/1.0.0-dev1/results/20251124_claude_3_5_sonnet_20241022/scores.json +62 -0
- mock_results/1.0.0-dev1/results/20251124_claude_3_opus_20240229/metadata.json +9 -0
- mock_results/1.0.0-dev1/results/20251124_claude_3_opus_20240229/scores.json +62 -0
- mock_results/1.0.0-dev1/results/20251124_gpt_4_turbo_2024_04_09/metadata.json +9 -0
- mock_results/1.0.0-dev1/results/20251124_gpt_4_turbo_2024_04_09/scores.json +62 -0
- mock_results/1.0.0-dev1/results/20251124_gpt_4o_2024_11_20/metadata.json +9 -0
- mock_results/1.0.0-dev1/results/20251124_gpt_4o_2024_11_20/scores.json +62 -0
- mock_results/1.0.0-dev1/results/20251124_gpt_4o_mini_2024_07_18/metadata.json +9 -0
- mock_results/1.0.0-dev1/results/20251124_gpt_4o_mini_2024_07_18/scores.json +62 -0
- mock_results/1.0.0-dev1/swe-bench-multimodal.jsonl +0 -5
- mock_results/1.0.0-dev1/swe-bench.jsonl +0 -5
- mock_results/1.0.0-dev1/swt-bench.jsonl +0 -5
- mock_results/1.0.0-dev1/test.jsonl +0 -30
- mock_results/1.0.0-dev1/test.parquet +0 -0
- mock_results/1.0.0-dev1/validation.jsonl +0 -30
- mock_results/1.0.0-dev1/validation.parquet +0 -0
- requirements.txt +0 -1
- setup_data.py +16 -20
- simple_data_loader.py +6 -33
app.py
CHANGED
|
@@ -2,56 +2,34 @@
|
|
| 2 |
import logging
|
| 3 |
import sys
|
| 4 |
|
| 5 |
-
logging.basicConfig(level=logging.INFO)
|
|
|
|
| 6 |
|
| 7 |
-
|
| 8 |
-
print("STARTING APP.PY", file=sys.stderr)
|
| 9 |
-
print("=" * 80, file=sys.stderr)
|
| 10 |
|
| 11 |
# Setup mock data before anything else
|
| 12 |
try:
|
| 13 |
-
print("Importing setup_data module...", file=sys.stderr)
|
| 14 |
from setup_data import setup_mock_data
|
| 15 |
-
print("Calling setup_mock_data()...", file=sys.stderr)
|
| 16 |
setup_mock_data()
|
| 17 |
-
|
| 18 |
except Exception as e:
|
| 19 |
-
|
| 20 |
-
|
| 21 |
-
traceback.print_exc()
|
| 22 |
-
print("Continuing with app startup despite error...", file=sys.stderr)
|
| 23 |
|
| 24 |
-
print("Importing gradio...", file=sys.stderr)
|
| 25 |
import gradio as gr
|
| 26 |
-
print("✓ Gradio imported", file=sys.stderr)
|
| 27 |
import urllib.parse
|
| 28 |
-
|
| 29 |
-
print("Importing dependencies...", file=sys.stderr)
|
| 30 |
-
from apscheduler.schedulers.background import BackgroundScheduler
|
| 31 |
from huggingface_hub import HfApi
|
| 32 |
-
print("✓ Dependencies imported", file=sys.stderr)
|
| 33 |
-
|
| 34 |
-
print("Importing config...", file=sys.stderr)
|
| 35 |
from config import LEADERBOARD_PATH, LOCAL_DEBUG
|
| 36 |
-
print(f"✓ Config imported (LOCAL_DEBUG={LOCAL_DEBUG})", file=sys.stderr)
|
| 37 |
-
|
| 38 |
-
print("Importing content and pages...", file=sys.stderr)
|
| 39 |
from content import css
|
| 40 |
-
print("✓ css imported", file=sys.stderr)
|
| 41 |
from main_page import build_page as build_main_page
|
| 42 |
-
print("✓ main_page imported", file=sys.stderr)
|
| 43 |
from bug_fixing import build_page as build_bug_fixing_page
|
| 44 |
-
print("✓ bug_fixing imported", file=sys.stderr)
|
| 45 |
from app_creation import build_page as build_app_creation_page
|
| 46 |
-
print("✓ app_creation imported", file=sys.stderr)
|
| 47 |
from frontend_development import build_page as build_frontend_page
|
| 48 |
-
print("✓ frontend_development imported", file=sys.stderr)
|
| 49 |
from test_generation import build_page as build_test_generation_page
|
| 50 |
-
print("✓ test_generation imported", file=sys.stderr)
|
| 51 |
from information_gathering import build_page as build_information_gathering_page
|
| 52 |
-
print("✓ information_gathering imported", file=sys.stderr)
|
| 53 |
from about import build_page as build_about_page
|
| 54 |
-
|
|
|
|
| 55 |
|
| 56 |
api = HfApi()
|
| 57 |
LOGO_PATH = "assets/logo.svg"
|
|
@@ -193,7 +171,7 @@ try:
|
|
| 193 |
encoded_svg = urllib.parse.quote(svg_content)
|
| 194 |
home_icon_data_uri = f"data:image/svg+xml,{encoded_svg}"
|
| 195 |
except FileNotFoundError:
|
| 196 |
-
|
| 197 |
home_icon_data_uri = "none"
|
| 198 |
|
| 199 |
# --- This is the final CSS ---
|
|
@@ -228,77 +206,45 @@ final_css = css + f"""
|
|
| 228 |
}}
|
| 229 |
"""
|
| 230 |
# --- Gradio App Definition ---
|
| 231 |
-
|
| 232 |
demo = gr.Blocks(
|
| 233 |
theme=theme,
|
| 234 |
css=final_css,
|
| 235 |
head=scroll_script + redirect_script + tooltip_script,
|
| 236 |
title="OpenHands Index",
|
| 237 |
)
|
| 238 |
-
print("✓ Gradio Blocks created", file=sys.stderr)
|
| 239 |
|
| 240 |
-
print("Building Home page route...", file=sys.stderr)
|
| 241 |
with demo.route("Home", "/home"):
|
| 242 |
build_main_page()
|
| 243 |
-
print("✓ Home page built", file=sys.stderr)
|
| 244 |
|
| 245 |
-
print("Building Bug Fixing page route...", file=sys.stderr)
|
| 246 |
with demo.route("Bug Fixing", "/bug-fixing"):
|
| 247 |
build_bug_fixing_page()
|
| 248 |
-
print("✓ Bug Fixing page built", file=sys.stderr)
|
| 249 |
|
| 250 |
-
print("Building App Creation page route...", file=sys.stderr)
|
| 251 |
with demo.route("App Creation", "/app-creation"):
|
| 252 |
build_app_creation_page()
|
| 253 |
-
print("✓ App Creation page built", file=sys.stderr)
|
| 254 |
|
| 255 |
-
print("Building Frontend Development page route...", file=sys.stderr)
|
| 256 |
with demo.route("Frontend Development", "/frontend-development"):
|
| 257 |
build_frontend_page()
|
| 258 |
-
print("✓ Frontend Development page built", file=sys.stderr)
|
| 259 |
|
| 260 |
-
print("Building Test Generation page route...", file=sys.stderr)
|
| 261 |
with demo.route("Test Generation", "/test-generation"):
|
| 262 |
build_test_generation_page()
|
| 263 |
-
print("✓ Test Generation page built", file=sys.stderr)
|
| 264 |
|
| 265 |
-
print("Building Information Gathering page route...", file=sys.stderr)
|
| 266 |
with demo.route("Information Gathering", "/information-gathering"):
|
| 267 |
build_information_gathering_page()
|
| 268 |
-
print("✓ Information Gathering page built", file=sys.stderr)
|
| 269 |
|
| 270 |
-
print("Building About page route...", file=sys.stderr)
|
| 271 |
with demo.route("About", "/about"):
|
| 272 |
build_about_page()
|
| 273 |
-
print("✓ About page built", file=sys.stderr)
|
| 274 |
-
|
| 275 |
-
# --- Scheduler and Launch
|
| 276 |
-
def restart_space_job():
|
| 277 |
-
print("Scheduler: Attempting to restart space.")
|
| 278 |
-
try:
|
| 279 |
-
api.restart_space(repo_id=LEADERBOARD_PATH)
|
| 280 |
-
print("Scheduler: Space restart request sent.")
|
| 281 |
-
except Exception as e:
|
| 282 |
-
print(f"Scheduler: Error restarting space: {e}")
|
| 283 |
|
| 284 |
-
|
| 285 |
-
# scheduler = BackgroundScheduler(timezone="UTC")
|
| 286 |
-
# scheduler.add_job(restart_space_job, "interval", hours=1)
|
| 287 |
-
# scheduler.start()
|
| 288 |
|
| 289 |
|
| 290 |
# Launch the Gradio app
|
| 291 |
if __name__ == "__main__":
|
| 292 |
-
print("=" * 80, file=sys.stderr)
|
| 293 |
-
print("READY TO LAUNCH GRADIO APP", file=sys.stderr)
|
| 294 |
-
print("=" * 80, file=sys.stderr)
|
| 295 |
if LOCAL_DEBUG:
|
| 296 |
-
|
| 297 |
demo.launch(debug=True, allowed_paths=["assets"], favicon_path="assets/favicon/favicon.ico")
|
| 298 |
else:
|
| 299 |
-
|
| 300 |
-
# For Spaces, share=False is typical unless specific tunneling is needed.
|
| 301 |
-
# debug=True can be set to False for a "production" Space.
|
| 302 |
demo.launch(server_name="0.0.0.0", server_port=7860, debug=True, share=False, allowed_paths=["assets"], favicon_path="assets/favicon/favicon.ico")
|
| 303 |
-
|
| 304 |
|
|
|
|
| 2 |
import logging
|
| 3 |
import sys
|
| 4 |
|
| 5 |
+
logging.basicConfig(level=logging.INFO, format='%(levelname)s: %(message)s')
|
| 6 |
+
logger = logging.getLogger(__name__)
|
| 7 |
|
| 8 |
+
logger.info("Starting OpenHands Index application")
|
|
|
|
|
|
|
| 9 |
|
| 10 |
# Setup mock data before anything else
|
| 11 |
try:
|
|
|
|
| 12 |
from setup_data import setup_mock_data
|
|
|
|
| 13 |
setup_mock_data()
|
| 14 |
+
logger.info("Data setup completed successfully")
|
| 15 |
except Exception as e:
|
| 16 |
+
logger.error(f"Error during data setup: {e}", exc_info=True)
|
| 17 |
+
logger.warning("Continuing with app startup despite error")
|
|
|
|
|
|
|
| 18 |
|
|
|
|
| 19 |
import gradio as gr
|
|
|
|
| 20 |
import urllib.parse
|
|
|
|
|
|
|
|
|
|
| 21 |
from huggingface_hub import HfApi
|
|
|
|
|
|
|
|
|
|
| 22 |
from config import LEADERBOARD_PATH, LOCAL_DEBUG
|
|
|
|
|
|
|
|
|
|
| 23 |
from content import css
|
|
|
|
| 24 |
from main_page import build_page as build_main_page
|
|
|
|
| 25 |
from bug_fixing import build_page as build_bug_fixing_page
|
|
|
|
| 26 |
from app_creation import build_page as build_app_creation_page
|
|
|
|
| 27 |
from frontend_development import build_page as build_frontend_page
|
|
|
|
| 28 |
from test_generation import build_page as build_test_generation_page
|
|
|
|
| 29 |
from information_gathering import build_page as build_information_gathering_page
|
|
|
|
| 30 |
from about import build_page as build_about_page
|
| 31 |
+
|
| 32 |
+
logger.info(f"All modules imported (LOCAL_DEBUG={LOCAL_DEBUG})")
|
| 33 |
|
| 34 |
api = HfApi()
|
| 35 |
LOGO_PATH = "assets/logo.svg"
|
|
|
|
| 171 |
encoded_svg = urllib.parse.quote(svg_content)
|
| 172 |
home_icon_data_uri = f"data:image/svg+xml,{encoded_svg}"
|
| 173 |
except FileNotFoundError:
|
| 174 |
+
logger.warning(f"Home icon file not found at {LOGO_PATH}")
|
| 175 |
home_icon_data_uri = "none"
|
| 176 |
|
| 177 |
# --- This is the final CSS ---
|
|
|
|
| 206 |
}}
|
| 207 |
"""
|
| 208 |
# --- Gradio App Definition ---
|
| 209 |
+
logger.info("Creating Gradio application")
|
| 210 |
demo = gr.Blocks(
|
| 211 |
theme=theme,
|
| 212 |
css=final_css,
|
| 213 |
head=scroll_script + redirect_script + tooltip_script,
|
| 214 |
title="OpenHands Index",
|
| 215 |
)
|
|
|
|
| 216 |
|
|
|
|
| 217 |
with demo.route("Home", "/home"):
|
| 218 |
build_main_page()
|
|
|
|
| 219 |
|
|
|
|
| 220 |
with demo.route("Bug Fixing", "/bug-fixing"):
|
| 221 |
build_bug_fixing_page()
|
|
|
|
| 222 |
|
|
|
|
| 223 |
with demo.route("App Creation", "/app-creation"):
|
| 224 |
build_app_creation_page()
|
|
|
|
| 225 |
|
|
|
|
| 226 |
with demo.route("Frontend Development", "/frontend-development"):
|
| 227 |
build_frontend_page()
|
|
|
|
| 228 |
|
|
|
|
| 229 |
with demo.route("Test Generation", "/test-generation"):
|
| 230 |
build_test_generation_page()
|
|
|
|
| 231 |
|
|
|
|
| 232 |
with demo.route("Information Gathering", "/information-gathering"):
|
| 233 |
build_information_gathering_page()
|
|
|
|
| 234 |
|
|
|
|
| 235 |
with demo.route("About", "/about"):
|
| 236 |
build_about_page()
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 237 |
|
| 238 |
+
logger.info("All routes configured")
|
|
|
|
|
|
|
|
|
|
| 239 |
|
| 240 |
|
| 241 |
# Launch the Gradio app
|
| 242 |
if __name__ == "__main__":
|
|
|
|
|
|
|
|
|
|
| 243 |
if LOCAL_DEBUG:
|
| 244 |
+
logger.info("Launching in LOCAL_DEBUG mode")
|
| 245 |
demo.launch(debug=True, allowed_paths=["assets"], favicon_path="assets/favicon/favicon.ico")
|
| 246 |
else:
|
| 247 |
+
logger.info("Launching in Space mode")
|
|
|
|
|
|
|
| 248 |
demo.launch(server_name="0.0.0.0", server_port=7860, debug=True, share=False, allowed_paths=["assets"], favicon_path="assets/favicon/favicon.ico")
|
| 249 |
+
logger.info("Gradio app launched successfully")
|
| 250 |
|
content.py
CHANGED
|
@@ -16,6 +16,10 @@ def create_gradio_anchor_id(text: str, validation) -> str:
|
|
| 16 |
TITLE = """<h1 align="left" id="space-title">OpenHands Index</h1>"""
|
| 17 |
|
| 18 |
INTRO_PARAGRAPH = """
|
|
|
|
|
|
|
|
|
|
|
|
|
| 19 |
<p>
|
| 20 |
<strong>OpenHands Index</strong> provides an aggregated view of agent performance and efficiency across all benchmarks in all categories. We report:
|
| 21 |
</p>
|
|
|
|
| 16 |
TITLE = """<h1 align="left" id="space-title">OpenHands Index</h1>"""
|
| 17 |
|
| 18 |
INTRO_PARAGRAPH = """
|
| 19 |
+
<p>
|
| 20 |
+
<b>Pre-release:</b> this codebase is not yet released.
|
| 21 |
+
</p>
|
| 22 |
+
|
| 23 |
<p>
|
| 24 |
<strong>OpenHands Index</strong> provides an aggregated view of agent performance and efficiency across all benchmarks in all categories. We report:
|
| 25 |
</p>
|
github_data_loader.py
DELETED
|
@@ -1,71 +0,0 @@
|
|
| 1 |
-
"""
|
| 2 |
-
Custom data loader for OpenHands Index that fetches from GitHub instead of HF datasets.
|
| 3 |
-
Mimics the interface of LeaderboardViewer from agent-eval.
|
| 4 |
-
"""
|
| 5 |
-
import pandas as pd
|
| 6 |
-
import requests
|
| 7 |
-
from typing import Dict, List, Tuple
|
| 8 |
-
|
| 9 |
-
|
| 10 |
-
class GitHubDataLoader:
|
| 11 |
-
"""Loads leaderboard data from GitHub repository."""
|
| 12 |
-
|
| 13 |
-
def __init__(self, base_url: str, split: str):
|
| 14 |
-
self.base_url = base_url
|
| 15 |
-
self.split = split
|
| 16 |
-
self.tag_map = self._build_tag_map()
|
| 17 |
-
|
| 18 |
-
def _build_tag_map(self) -> Dict[str, List[str]]:
|
| 19 |
-
"""Build tag map for the OpenHands datasets."""
|
| 20 |
-
# Map datasets to their respective tags
|
| 21 |
-
return {
|
| 22 |
-
"swe-bench": ["swe-bench"],
|
| 23 |
-
"multi-swe-bench": ["multi-swe-bench"],
|
| 24 |
-
"swe-bench-multimodal": ["swe-bench-multimodal"],
|
| 25 |
-
"swt-bench": ["swt-bench"],
|
| 26 |
-
"commit0": ["commit0"],
|
| 27 |
-
"gaia": ["gaia"],
|
| 28 |
-
}
|
| 29 |
-
|
| 30 |
-
def _load(self) -> Tuple[pd.DataFrame, Dict]:
|
| 31 |
-
"""Load and combine data from all GitHub JSON files."""
|
| 32 |
-
all_results = []
|
| 33 |
-
|
| 34 |
-
datasets = ["swe-bench", "multi-swe-bench", "swe-bench-multimodal",
|
| 35 |
-
"swt-bench", "commit0", "gaia"]
|
| 36 |
-
|
| 37 |
-
for dataset in datasets:
|
| 38 |
-
url = f"{self.base_url}/{dataset}.json"
|
| 39 |
-
try:
|
| 40 |
-
response = requests.get(url, timeout=10)
|
| 41 |
-
if response.status_code == 200:
|
| 42 |
-
data = response.json()
|
| 43 |
-
# Transform GitHub data to match agenteval format
|
| 44 |
-
for entry in data:
|
| 45 |
-
all_results.append({
|
| 46 |
-
"agent_name": entry.get("agent_name", "Unknown"),
|
| 47 |
-
"score": entry.get("score", 0.0),
|
| 48 |
-
"dataset": dataset,
|
| 49 |
-
"split": self.split,
|
| 50 |
-
# Add other fields as needed
|
| 51 |
-
})
|
| 52 |
-
except Exception as e:
|
| 53 |
-
print(f"Warning: Could not load data from {url}: {e}")
|
| 54 |
-
continue
|
| 55 |
-
|
| 56 |
-
if all_results:
|
| 57 |
-
df = pd.DataFrame(all_results)
|
| 58 |
-
return df, self.tag_map
|
| 59 |
-
else:
|
| 60 |
-
return pd.DataFrame(), self.tag_map
|
| 61 |
-
|
| 62 |
-
|
| 63 |
-
class DummyViewer:
|
| 64 |
-
"""Fallback viewer when data loading fails."""
|
| 65 |
-
|
| 66 |
-
def __init__(self, df: pd.DataFrame):
|
| 67 |
-
self._df = df
|
| 68 |
-
self.tag_map = {"Overall": []}
|
| 69 |
-
|
| 70 |
-
def _load(self) -> Tuple[pd.DataFrame, Dict]:
|
| 71 |
-
return self._df, self.tag_map
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
mock_results/1.0.0-dev1/agenteval.json
DELETED
|
@@ -1,74 +0,0 @@
|
|
| 1 |
-
{
|
| 2 |
-
"suite_config": {
|
| 3 |
-
"name": "openhands-index",
|
| 4 |
-
"version": "1.0.0-dev1",
|
| 5 |
-
"splits": [
|
| 6 |
-
{
|
| 7 |
-
"name": "swe-bench",
|
| 8 |
-
"tasks": [
|
| 9 |
-
{
|
| 10 |
-
"name": "swe-bench",
|
| 11 |
-
"tags": [
|
| 12 |
-
"swe-bench"
|
| 13 |
-
]
|
| 14 |
-
}
|
| 15 |
-
]
|
| 16 |
-
},
|
| 17 |
-
{
|
| 18 |
-
"name": "multi-swe-bench",
|
| 19 |
-
"tasks": [
|
| 20 |
-
{
|
| 21 |
-
"name": "multi-swe-bench",
|
| 22 |
-
"tags": [
|
| 23 |
-
"multi-swe-bench"
|
| 24 |
-
]
|
| 25 |
-
}
|
| 26 |
-
]
|
| 27 |
-
},
|
| 28 |
-
{
|
| 29 |
-
"name": "swe-bench-multimodal",
|
| 30 |
-
"tasks": [
|
| 31 |
-
{
|
| 32 |
-
"name": "swe-bench-multimodal",
|
| 33 |
-
"tags": [
|
| 34 |
-
"swe-bench-multimodal"
|
| 35 |
-
]
|
| 36 |
-
}
|
| 37 |
-
]
|
| 38 |
-
},
|
| 39 |
-
{
|
| 40 |
-
"name": "swt-bench",
|
| 41 |
-
"tasks": [
|
| 42 |
-
{
|
| 43 |
-
"name": "swt-bench",
|
| 44 |
-
"tags": [
|
| 45 |
-
"swt-bench"
|
| 46 |
-
]
|
| 47 |
-
}
|
| 48 |
-
]
|
| 49 |
-
},
|
| 50 |
-
{
|
| 51 |
-
"name": "commit0",
|
| 52 |
-
"tasks": [
|
| 53 |
-
{
|
| 54 |
-
"name": "commit0",
|
| 55 |
-
"tags": [
|
| 56 |
-
"commit0"
|
| 57 |
-
]
|
| 58 |
-
}
|
| 59 |
-
]
|
| 60 |
-
},
|
| 61 |
-
{
|
| 62 |
-
"name": "gaia",
|
| 63 |
-
"tasks": [
|
| 64 |
-
{
|
| 65 |
-
"name": "gaia",
|
| 66 |
-
"tags": [
|
| 67 |
-
"gaia"
|
| 68 |
-
]
|
| 69 |
-
}
|
| 70 |
-
]
|
| 71 |
-
}
|
| 72 |
-
]
|
| 73 |
-
}
|
| 74 |
-
}
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
mock_results/1.0.0-dev1/commit0.jsonl
DELETED
|
@@ -1,5 +0,0 @@
|
|
| 1 |
-
{"agent_name": "1.0.2", "llm_base": "claude-3-5-sonnet-20241022", "openness": "closed", "score": 71.2, "metric": "test_pass_rate", "submission_time": "2025-11-25T00:12:22.972910", "tags": ["commit0"], "total_cost": 45.6, "total_runtime": 656.0}
|
| 2 |
-
{"agent_name": "1.0.1", "llm_base": "gpt-4o-2024-11-20", "openness": "closed", "score": 68.9, "metric": "test_pass_rate", "submission_time": "2025-11-25T00:12:22.972929", "tags": ["commit0"], "total_cost": 44.45, "total_runtime": 644.5}
|
| 3 |
-
{"agent_name": "1.0.0", "llm_base": "gpt-4-turbo-2024-04-09", "openness": "closed", "score": 61.5, "metric": "test_pass_rate", "submission_time": "2025-11-25T00:12:22.972939", "tags": ["commit0"], "total_cost": 40.75, "total_runtime": 607.5}
|
| 4 |
-
{"agent_name": "0.9.5", "llm_base": "gpt-4o-mini-2024-07-18", "openness": "closed", "score": 55.3, "metric": "test_pass_rate", "submission_time": "2025-11-25T00:12:22.972947", "tags": ["commit0"], "total_cost": 37.65, "total_runtime": 576.5}
|
| 5 |
-
{"agent_name": "0.9.0", "llm_base": "claude-3-opus-20240229", "openness": "closed", "score": 52.1, "metric": "test_pass_rate", "submission_time": "2025-11-25T00:12:22.972954", "tags": ["commit0"], "total_cost": 36.05, "total_runtime": 560.5}
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
mock_results/1.0.0-dev1/gaia.jsonl
DELETED
|
@@ -1,5 +0,0 @@
|
|
| 1 |
-
{"agent_name": "1.0.2", "llm_base": "claude-3-5-sonnet-20241022", "openness": "closed", "score": 58.7, "metric": "accuracy", "submission_time": "2025-11-25T00:12:22.973093", "tags": ["gaia"], "total_cost": 39.35, "total_runtime": 593.5}
|
| 2 |
-
{"agent_name": "1.0.1", "llm_base": "gpt-4o-2024-11-20", "openness": "closed", "score": 55.2, "metric": "accuracy", "submission_time": "2025-11-25T00:12:22.973111", "tags": ["gaia"], "total_cost": 37.6, "total_runtime": 576.0}
|
| 3 |
-
{"agent_name": "1.0.0", "llm_base": "gpt-4-turbo-2024-04-09", "openness": "closed", "score": 48.3, "metric": "accuracy", "submission_time": "2025-11-25T00:12:22.973121", "tags": ["gaia"], "total_cost": 34.15, "total_runtime": 541.5}
|
| 4 |
-
{"agent_name": "0.9.5", "llm_base": "gpt-4o-mini-2024-07-18", "openness": "closed", "score": 42.1, "metric": "accuracy", "submission_time": "2025-11-25T00:12:22.973129", "tags": ["gaia"], "total_cost": 31.05, "total_runtime": 510.5}
|
| 5 |
-
{"agent_name": "0.9.0", "llm_base": "claude-3-opus-20240229", "openness": "closed", "score": 39.4, "metric": "accuracy", "submission_time": "2025-11-25T00:12:22.973137", "tags": ["gaia"], "total_cost": 29.7, "total_runtime": 497.0}
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
mock_results/1.0.0-dev1/multi-swe-bench.jsonl
DELETED
|
@@ -1,5 +0,0 @@
|
|
| 1 |
-
{"agent_name": "1.0.2", "llm_base": "claude-3-5-sonnet-20241022", "openness": "closed", "score": 35.2, "metric": "resolve_rate", "submission_time": "2025-11-25T00:12:22.972368", "tags": ["multi-swe-bench"], "total_cost": 27.6, "total_runtime": 476.0}
|
| 2 |
-
{"agent_name": "1.0.1", "llm_base": "gpt-4o-2024-11-20", "openness": "closed", "score": 32.8, "metric": "resolve_rate", "submission_time": "2025-11-25T00:12:22.972389", "tags": ["multi-swe-bench"], "total_cost": 26.4, "total_runtime": 464.0}
|
| 3 |
-
{"agent_name": "1.0.0", "llm_base": "gpt-4-turbo-2024-04-09", "openness": "closed", "score": 28.4, "metric": "resolve_rate", "submission_time": "2025-11-25T00:12:22.972400", "tags": ["multi-swe-bench"], "total_cost": 24.2, "total_runtime": 442.0}
|
| 4 |
-
{"agent_name": "0.9.5", "llm_base": "gpt-4o-mini-2024-07-18", "openness": "closed", "score": 24.1, "metric": "resolve_rate", "submission_time": "2025-11-25T00:12:22.972408", "tags": ["multi-swe-bench"], "total_cost": 22.05, "total_runtime": 420.5}
|
| 5 |
-
{"agent_name": "0.9.0", "llm_base": "claude-3-opus-20240229", "openness": "closed", "score": 21.5, "metric": "resolve_rate", "submission_time": "2025-11-25T00:12:22.972416", "tags": ["multi-swe-bench"], "total_cost": 20.75, "total_runtime": 407.5}
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
mock_results/1.0.0-dev1/results/20251124_claude_3_5_sonnet_20241022/metadata.json
ADDED
|
@@ -0,0 +1,9 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"agent_name": "OpenHands CodeAct v2.1",
|
| 3 |
+
"agent_version": "OpenHands CodeAct v2.1",
|
| 4 |
+
"model": "claude-3-5-sonnet-20241022",
|
| 5 |
+
"openness": "closed_api_available",
|
| 6 |
+
"tool_usage": "standard",
|
| 7 |
+
"submission_time": "2025-11-24T19:56:00.092865",
|
| 8 |
+
"directory_name": "20251124_claude_3_5_sonnet_20241022"
|
| 9 |
+
}
|
mock_results/1.0.0-dev1/results/20251124_claude_3_5_sonnet_20241022/scores.json
ADDED
|
@@ -0,0 +1,62 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
[
|
| 2 |
+
{
|
| 3 |
+
"benchmark": "swe-bench",
|
| 4 |
+
"score": 48.3,
|
| 5 |
+
"metric": "resolve_rate",
|
| 6 |
+
"total_cost": 34.15,
|
| 7 |
+
"total_runtime": 541.5,
|
| 8 |
+
"tags": [
|
| 9 |
+
"swe-bench"
|
| 10 |
+
]
|
| 11 |
+
},
|
| 12 |
+
{
|
| 13 |
+
"benchmark": "swe-bench-multimodal",
|
| 14 |
+
"score": 42.1,
|
| 15 |
+
"metric": "resolve_rate",
|
| 16 |
+
"total_cost": 31.05,
|
| 17 |
+
"total_runtime": 510.5,
|
| 18 |
+
"tags": [
|
| 19 |
+
"swe-bench-multimodal"
|
| 20 |
+
]
|
| 21 |
+
},
|
| 22 |
+
{
|
| 23 |
+
"benchmark": "commit0",
|
| 24 |
+
"score": 71.2,
|
| 25 |
+
"metric": "test_pass_rate",
|
| 26 |
+
"total_cost": 45.6,
|
| 27 |
+
"total_runtime": 656.0,
|
| 28 |
+
"tags": [
|
| 29 |
+
"commit0"
|
| 30 |
+
]
|
| 31 |
+
},
|
| 32 |
+
{
|
| 33 |
+
"benchmark": "multi-swe-bench",
|
| 34 |
+
"score": 35.2,
|
| 35 |
+
"metric": "resolve_rate",
|
| 36 |
+
"total_cost": 27.6,
|
| 37 |
+
"total_runtime": 476.0,
|
| 38 |
+
"tags": [
|
| 39 |
+
"multi-swe-bench"
|
| 40 |
+
]
|
| 41 |
+
},
|
| 42 |
+
{
|
| 43 |
+
"benchmark": "swt-bench",
|
| 44 |
+
"score": 65.4,
|
| 45 |
+
"metric": "success_rate",
|
| 46 |
+
"total_cost": 42.7,
|
| 47 |
+
"total_runtime": 627.0,
|
| 48 |
+
"tags": [
|
| 49 |
+
"swt-bench"
|
| 50 |
+
]
|
| 51 |
+
},
|
| 52 |
+
{
|
| 53 |
+
"benchmark": "gaia",
|
| 54 |
+
"score": 58.7,
|
| 55 |
+
"metric": "accuracy",
|
| 56 |
+
"total_cost": 39.35,
|
| 57 |
+
"total_runtime": 593.5,
|
| 58 |
+
"tags": [
|
| 59 |
+
"gaia"
|
| 60 |
+
]
|
| 61 |
+
}
|
| 62 |
+
]
|
mock_results/1.0.0-dev1/results/20251124_claude_3_opus_20240229/metadata.json
ADDED
|
@@ -0,0 +1,9 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"agent_name": "SWE-Agent",
|
| 3 |
+
"agent_version": "SWE-Agent",
|
| 4 |
+
"model": "claude-3-opus-20240229",
|
| 5 |
+
"openness": "closed_api_available",
|
| 6 |
+
"tool_usage": "custom_interface",
|
| 7 |
+
"submission_time": "2025-11-24T19:56:00.092922",
|
| 8 |
+
"directory_name": "20251124_claude_3_opus_20240229"
|
| 9 |
+
}
|
mock_results/1.0.0-dev1/results/20251124_claude_3_opus_20240229/scores.json
ADDED
|
@@ -0,0 +1,62 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
[
|
| 2 |
+
{
|
| 3 |
+
"benchmark": "swe-bench",
|
| 4 |
+
"score": 29.8,
|
| 5 |
+
"metric": "resolve_rate",
|
| 6 |
+
"total_cost": 24.9,
|
| 7 |
+
"total_runtime": 449.0,
|
| 8 |
+
"tags": [
|
| 9 |
+
"swe-bench"
|
| 10 |
+
]
|
| 11 |
+
},
|
| 12 |
+
{
|
| 13 |
+
"benchmark": "swe-bench-multimodal",
|
| 14 |
+
"score": 25.7,
|
| 15 |
+
"metric": "resolve_rate",
|
| 16 |
+
"total_cost": 22.85,
|
| 17 |
+
"total_runtime": 428.5,
|
| 18 |
+
"tags": [
|
| 19 |
+
"swe-bench-multimodal"
|
| 20 |
+
]
|
| 21 |
+
},
|
| 22 |
+
{
|
| 23 |
+
"benchmark": "commit0",
|
| 24 |
+
"score": 52.1,
|
| 25 |
+
"metric": "test_pass_rate",
|
| 26 |
+
"total_cost": 36.05,
|
| 27 |
+
"total_runtime": 560.5,
|
| 28 |
+
"tags": [
|
| 29 |
+
"commit0"
|
| 30 |
+
]
|
| 31 |
+
},
|
| 32 |
+
{
|
| 33 |
+
"benchmark": "multi-swe-bench",
|
| 34 |
+
"score": 21.5,
|
| 35 |
+
"metric": "resolve_rate",
|
| 36 |
+
"total_cost": 20.75,
|
| 37 |
+
"total_runtime": 407.5,
|
| 38 |
+
"tags": [
|
| 39 |
+
"multi-swe-bench"
|
| 40 |
+
]
|
| 41 |
+
},
|
| 42 |
+
{
|
| 43 |
+
"benchmark": "swt-bench",
|
| 44 |
+
"score": 44.2,
|
| 45 |
+
"metric": "success_rate",
|
| 46 |
+
"total_cost": 32.1,
|
| 47 |
+
"total_runtime": 521.0,
|
| 48 |
+
"tags": [
|
| 49 |
+
"swt-bench"
|
| 50 |
+
]
|
| 51 |
+
},
|
| 52 |
+
{
|
| 53 |
+
"benchmark": "gaia",
|
| 54 |
+
"score": 39.4,
|
| 55 |
+
"metric": "accuracy",
|
| 56 |
+
"total_cost": 29.7,
|
| 57 |
+
"total_runtime": 497.0,
|
| 58 |
+
"tags": [
|
| 59 |
+
"gaia"
|
| 60 |
+
]
|
| 61 |
+
}
|
| 62 |
+
]
|
mock_results/1.0.0-dev1/results/20251124_gpt_4_turbo_2024_04_09/metadata.json
ADDED
|
@@ -0,0 +1,9 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"agent_name": "AutoCodeRover",
|
| 3 |
+
"agent_version": "AutoCodeRover",
|
| 4 |
+
"model": "gpt-4-turbo-2024-04-09",
|
| 5 |
+
"openness": "closed_api_available",
|
| 6 |
+
"tool_usage": "standard",
|
| 7 |
+
"submission_time": "2025-11-24T19:56:00.092908",
|
| 8 |
+
"directory_name": "20251124_gpt_4_turbo_2024_04_09"
|
| 9 |
+
}
|
mock_results/1.0.0-dev1/results/20251124_gpt_4_turbo_2024_04_09/scores.json
ADDED
|
@@ -0,0 +1,62 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
[
|
| 2 |
+
{
|
| 3 |
+
"benchmark": "swe-bench",
|
| 4 |
+
"score": 38.7,
|
| 5 |
+
"metric": "resolve_rate",
|
| 6 |
+
"total_cost": 29.35,
|
| 7 |
+
"total_runtime": 493.5,
|
| 8 |
+
"tags": [
|
| 9 |
+
"swe-bench"
|
| 10 |
+
]
|
| 11 |
+
},
|
| 12 |
+
{
|
| 13 |
+
"benchmark": "swe-bench-multimodal",
|
| 14 |
+
"score": 34.2,
|
| 15 |
+
"metric": "resolve_rate",
|
| 16 |
+
"total_cost": 27.1,
|
| 17 |
+
"total_runtime": 471.0,
|
| 18 |
+
"tags": [
|
| 19 |
+
"swe-bench-multimodal"
|
| 20 |
+
]
|
| 21 |
+
},
|
| 22 |
+
{
|
| 23 |
+
"benchmark": "commit0",
|
| 24 |
+
"score": 61.5,
|
| 25 |
+
"metric": "test_pass_rate",
|
| 26 |
+
"total_cost": 40.75,
|
| 27 |
+
"total_runtime": 607.5,
|
| 28 |
+
"tags": [
|
| 29 |
+
"commit0"
|
| 30 |
+
]
|
| 31 |
+
},
|
| 32 |
+
{
|
| 33 |
+
"benchmark": "multi-swe-bench",
|
| 34 |
+
"score": 28.4,
|
| 35 |
+
"metric": "resolve_rate",
|
| 36 |
+
"total_cost": 24.2,
|
| 37 |
+
"total_runtime": 442.0,
|
| 38 |
+
"tags": [
|
| 39 |
+
"multi-swe-bench"
|
| 40 |
+
]
|
| 41 |
+
},
|
| 42 |
+
{
|
| 43 |
+
"benchmark": "swt-bench",
|
| 44 |
+
"score": 54.1,
|
| 45 |
+
"metric": "success_rate",
|
| 46 |
+
"total_cost": 37.05,
|
| 47 |
+
"total_runtime": 570.5,
|
| 48 |
+
"tags": [
|
| 49 |
+
"swt-bench"
|
| 50 |
+
]
|
| 51 |
+
},
|
| 52 |
+
{
|
| 53 |
+
"benchmark": "gaia",
|
| 54 |
+
"score": 48.3,
|
| 55 |
+
"metric": "accuracy",
|
| 56 |
+
"total_cost": 34.15,
|
| 57 |
+
"total_runtime": 541.5,
|
| 58 |
+
"tags": [
|
| 59 |
+
"gaia"
|
| 60 |
+
]
|
| 61 |
+
}
|
| 62 |
+
]
|
mock_results/1.0.0-dev1/results/20251124_gpt_4o_2024_11_20/metadata.json
ADDED
|
@@ -0,0 +1,9 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"agent_name": "OpenHands CodeAct v2.0",
|
| 3 |
+
"agent_version": "OpenHands CodeAct v2.0",
|
| 4 |
+
"model": "gpt-4o-2024-11-20",
|
| 5 |
+
"openness": "closed_api_available",
|
| 6 |
+
"tool_usage": "standard",
|
| 7 |
+
"submission_time": "2025-11-24T19:56:00.092895",
|
| 8 |
+
"directory_name": "20251124_gpt_4o_2024_11_20"
|
| 9 |
+
}
|
mock_results/1.0.0-dev1/results/20251124_gpt_4o_2024_11_20/scores.json
ADDED
|
@@ -0,0 +1,62 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
[
|
| 2 |
+
{
|
| 3 |
+
"benchmark": "swe-bench",
|
| 4 |
+
"score": 45.1,
|
| 5 |
+
"metric": "resolve_rate",
|
| 6 |
+
"total_cost": 32.55,
|
| 7 |
+
"total_runtime": 525.5,
|
| 8 |
+
"tags": [
|
| 9 |
+
"swe-bench"
|
| 10 |
+
]
|
| 11 |
+
},
|
| 12 |
+
{
|
| 13 |
+
"benchmark": "swe-bench-multimodal",
|
| 14 |
+
"score": 39.5,
|
| 15 |
+
"metric": "resolve_rate",
|
| 16 |
+
"total_cost": 29.75,
|
| 17 |
+
"total_runtime": 497.5,
|
| 18 |
+
"tags": [
|
| 19 |
+
"swe-bench-multimodal"
|
| 20 |
+
]
|
| 21 |
+
},
|
| 22 |
+
{
|
| 23 |
+
"benchmark": "commit0",
|
| 24 |
+
"score": 68.9,
|
| 25 |
+
"metric": "test_pass_rate",
|
| 26 |
+
"total_cost": 44.45,
|
| 27 |
+
"total_runtime": 644.5,
|
| 28 |
+
"tags": [
|
| 29 |
+
"commit0"
|
| 30 |
+
]
|
| 31 |
+
},
|
| 32 |
+
{
|
| 33 |
+
"benchmark": "multi-swe-bench",
|
| 34 |
+
"score": 32.8,
|
| 35 |
+
"metric": "resolve_rate",
|
| 36 |
+
"total_cost": 26.4,
|
| 37 |
+
"total_runtime": 464.0,
|
| 38 |
+
"tags": [
|
| 39 |
+
"multi-swe-bench"
|
| 40 |
+
]
|
| 41 |
+
},
|
| 42 |
+
{
|
| 43 |
+
"benchmark": "swt-bench",
|
| 44 |
+
"score": 62.3,
|
| 45 |
+
"metric": "success_rate",
|
| 46 |
+
"total_cost": 41.15,
|
| 47 |
+
"total_runtime": 611.5,
|
| 48 |
+
"tags": [
|
| 49 |
+
"swt-bench"
|
| 50 |
+
]
|
| 51 |
+
},
|
| 52 |
+
{
|
| 53 |
+
"benchmark": "gaia",
|
| 54 |
+
"score": 55.2,
|
| 55 |
+
"metric": "accuracy",
|
| 56 |
+
"total_cost": 37.6,
|
| 57 |
+
"total_runtime": 576.0,
|
| 58 |
+
"tags": [
|
| 59 |
+
"gaia"
|
| 60 |
+
]
|
| 61 |
+
}
|
| 62 |
+
]
|
mock_results/1.0.0-dev1/results/20251124_gpt_4o_mini_2024_07_18/metadata.json
ADDED
|
@@ -0,0 +1,9 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"agent_name": "Agentless",
|
| 3 |
+
"agent_version": "Agentless",
|
| 4 |
+
"model": "gpt-4o-mini-2024-07-18",
|
| 5 |
+
"openness": "closed_api_available",
|
| 6 |
+
"tool_usage": "standard",
|
| 7 |
+
"submission_time": "2025-11-24T19:56:00.092916",
|
| 8 |
+
"directory_name": "20251124_gpt_4o_mini_2024_07_18"
|
| 9 |
+
}
|
mock_results/1.0.0-dev1/results/20251124_gpt_4o_mini_2024_07_18/scores.json
ADDED
|
@@ -0,0 +1,62 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
[
|
| 2 |
+
{
|
| 3 |
+
"benchmark": "swe-bench",
|
| 4 |
+
"score": 32.5,
|
| 5 |
+
"metric": "resolve_rate",
|
| 6 |
+
"total_cost": 26.25,
|
| 7 |
+
"total_runtime": 462.5,
|
| 8 |
+
"tags": [
|
| 9 |
+
"swe-bench"
|
| 10 |
+
]
|
| 11 |
+
},
|
| 12 |
+
{
|
| 13 |
+
"benchmark": "swe-bench-multimodal",
|
| 14 |
+
"score": 28.9,
|
| 15 |
+
"metric": "resolve_rate",
|
| 16 |
+
"total_cost": 24.45,
|
| 17 |
+
"total_runtime": 444.5,
|
| 18 |
+
"tags": [
|
| 19 |
+
"swe-bench-multimodal"
|
| 20 |
+
]
|
| 21 |
+
},
|
| 22 |
+
{
|
| 23 |
+
"benchmark": "commit0",
|
| 24 |
+
"score": 55.3,
|
| 25 |
+
"metric": "test_pass_rate",
|
| 26 |
+
"total_cost": 37.65,
|
| 27 |
+
"total_runtime": 576.5,
|
| 28 |
+
"tags": [
|
| 29 |
+
"commit0"
|
| 30 |
+
]
|
| 31 |
+
},
|
| 32 |
+
{
|
| 33 |
+
"benchmark": "multi-swe-bench",
|
| 34 |
+
"score": 24.1,
|
| 35 |
+
"metric": "resolve_rate",
|
| 36 |
+
"total_cost": 22.05,
|
| 37 |
+
"total_runtime": 420.5,
|
| 38 |
+
"tags": [
|
| 39 |
+
"multi-swe-bench"
|
| 40 |
+
]
|
| 41 |
+
},
|
| 42 |
+
{
|
| 43 |
+
"benchmark": "swt-bench",
|
| 44 |
+
"score": 47.8,
|
| 45 |
+
"metric": "success_rate",
|
| 46 |
+
"total_cost": 33.9,
|
| 47 |
+
"total_runtime": 539.0,
|
| 48 |
+
"tags": [
|
| 49 |
+
"swt-bench"
|
| 50 |
+
]
|
| 51 |
+
},
|
| 52 |
+
{
|
| 53 |
+
"benchmark": "gaia",
|
| 54 |
+
"score": 42.1,
|
| 55 |
+
"metric": "accuracy",
|
| 56 |
+
"total_cost": 31.05,
|
| 57 |
+
"total_runtime": 510.5,
|
| 58 |
+
"tags": [
|
| 59 |
+
"gaia"
|
| 60 |
+
]
|
| 61 |
+
}
|
| 62 |
+
]
|
mock_results/1.0.0-dev1/swe-bench-multimodal.jsonl
DELETED
|
@@ -1,5 +0,0 @@
|
|
| 1 |
-
{"agent_name": "1.0.2", "llm_base": "claude-3-5-sonnet-20241022", "openness": "closed", "score": 42.1, "metric": "resolve_rate", "submission_time": "2025-11-25T00:12:22.972550", "tags": ["swe-bench-multimodal"], "total_cost": 31.05, "total_runtime": 510.5}
|
| 2 |
-
{"agent_name": "1.0.1", "llm_base": "gpt-4o-2024-11-20", "openness": "closed", "score": 39.5, "metric": "resolve_rate", "submission_time": "2025-11-25T00:12:22.972567", "tags": ["swe-bench-multimodal"], "total_cost": 29.75, "total_runtime": 497.5}
|
| 3 |
-
{"agent_name": "1.0.0", "llm_base": "gpt-4-turbo-2024-04-09", "openness": "closed", "score": 34.2, "metric": "resolve_rate", "submission_time": "2025-11-25T00:12:22.972577", "tags": ["swe-bench-multimodal"], "total_cost": 27.1, "total_runtime": 471.0}
|
| 4 |
-
{"agent_name": "0.9.5", "llm_base": "gpt-4o-mini-2024-07-18", "openness": "closed", "score": 28.9, "metric": "resolve_rate", "submission_time": "2025-11-25T00:12:22.972585", "tags": ["swe-bench-multimodal"], "total_cost": 24.45, "total_runtime": 444.5}
|
| 5 |
-
{"agent_name": "0.9.0", "llm_base": "claude-3-opus-20240229", "openness": "closed", "score": 25.7, "metric": "resolve_rate", "submission_time": "2025-11-25T00:12:22.972593", "tags": ["swe-bench-multimodal"], "total_cost": 22.85, "total_runtime": 428.5}
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
mock_results/1.0.0-dev1/swe-bench.jsonl
DELETED
|
@@ -1,5 +0,0 @@
|
|
| 1 |
-
{"agent_name": "1.0.2", "llm_base": "claude-3-5-sonnet-20241022", "openness": "closed", "score": 48.3, "metric": "resolve_rate", "submission_time": "2025-11-25T00:12:22.972101", "tags": ["swe-bench"], "total_cost": 34.15, "total_runtime": 541.5}
|
| 2 |
-
{"agent_name": "1.0.1", "llm_base": "gpt-4o-2024-11-20", "openness": "closed", "score": 45.1, "metric": "resolve_rate", "submission_time": "2025-11-25T00:12:22.972136", "tags": ["swe-bench"], "total_cost": 32.55, "total_runtime": 525.5}
|
| 3 |
-
{"agent_name": "1.0.0", "llm_base": "gpt-4-turbo-2024-04-09", "openness": "closed", "score": 38.7, "metric": "resolve_rate", "submission_time": "2025-11-25T00:12:22.972167", "tags": ["swe-bench"], "total_cost": 29.35, "total_runtime": 493.5}
|
| 4 |
-
{"agent_name": "0.9.5", "llm_base": "gpt-4o-mini-2024-07-18", "openness": "closed", "score": 32.5, "metric": "resolve_rate", "submission_time": "2025-11-25T00:12:22.972178", "tags": ["swe-bench"], "total_cost": 26.25, "total_runtime": 462.5}
|
| 5 |
-
{"agent_name": "0.9.0", "llm_base": "claude-3-opus-20240229", "openness": "closed", "score": 29.8, "metric": "resolve_rate", "submission_time": "2025-11-25T00:12:22.972186", "tags": ["swe-bench"], "total_cost": 24.9, "total_runtime": 449.0}
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
mock_results/1.0.0-dev1/swt-bench.jsonl
DELETED
|
@@ -1,5 +0,0 @@
|
|
| 1 |
-
{"agent_name": "1.0.2", "llm_base": "claude-3-5-sonnet-20241022", "openness": "closed", "score": 65.4, "metric": "success_rate", "submission_time": "2025-11-25T00:12:22.972724", "tags": ["swt-bench"], "total_cost": 42.7, "total_runtime": 627.0}
|
| 2 |
-
{"agent_name": "1.0.1", "llm_base": "gpt-4o-2024-11-20", "openness": "closed", "score": 62.3, "metric": "success_rate", "submission_time": "2025-11-25T00:12:22.972741", "tags": ["swt-bench"], "total_cost": 41.15, "total_runtime": 611.5}
|
| 3 |
-
{"agent_name": "1.0.0", "llm_base": "gpt-4-turbo-2024-04-09", "openness": "closed", "score": 54.1, "metric": "success_rate", "submission_time": "2025-11-25T00:12:22.972750", "tags": ["swt-bench"], "total_cost": 37.05, "total_runtime": 570.5}
|
| 4 |
-
{"agent_name": "0.9.5", "llm_base": "gpt-4o-mini-2024-07-18", "openness": "closed", "score": 47.8, "metric": "success_rate", "submission_time": "2025-11-25T00:12:22.972758", "tags": ["swt-bench"], "total_cost": 33.9, "total_runtime": 539.0}
|
| 5 |
-
{"agent_name": "0.9.0", "llm_base": "claude-3-opus-20240229", "openness": "closed", "score": 44.2, "metric": "success_rate", "submission_time": "2025-11-25T00:12:22.972765", "tags": ["swt-bench"], "total_cost": 32.1, "total_runtime": 521.0}
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
mock_results/1.0.0-dev1/test.jsonl
DELETED
|
@@ -1,30 +0,0 @@
|
|
| 1 |
-
{"agent_name": "OpenHands CodeAct v2.1", "llm_base": "claude-3-5-sonnet-20241022", "openness": "closed_api_available", "tool_usage": "standard", "score": 48.3, "metric": "resolve_rate", "submission_time": "2025-11-24T19:56:00.092865", "tags": ["swe-bench"], "total_cost": 34.15, "total_runtime": 541.5}
|
| 2 |
-
{"agent_name": "OpenHands CodeAct v2.0", "llm_base": "gpt-4o-2024-11-20", "openness": "closed_api_available", "tool_usage": "standard", "score": 45.1, "metric": "resolve_rate", "submission_time": "2025-11-24T19:56:00.092895", "tags": ["swe-bench"], "total_cost": 32.55, "total_runtime": 525.5}
|
| 3 |
-
{"agent_name": "AutoCodeRover", "llm_base": "gpt-4-turbo-2024-04-09", "openness": "closed_api_available", "tool_usage": "standard", "score": 38.7, "metric": "resolve_rate", "submission_time": "2025-11-24T19:56:00.092908", "tags": ["swe-bench"], "total_cost": 29.35, "total_runtime": 493.5}
|
| 4 |
-
{"agent_name": "Agentless", "llm_base": "gpt-4o-mini-2024-07-18", "openness": "closed_api_available", "tool_usage": "standard", "score": 32.5, "metric": "resolve_rate", "submission_time": "2025-11-24T19:56:00.092916", "tags": ["swe-bench"], "total_cost": 26.25, "total_runtime": 462.5}
|
| 5 |
-
{"agent_name": "SWE-Agent", "llm_base": "claude-3-opus-20240229", "openness": "closed_api_available", "tool_usage": "custom_interface", "score": 29.8, "metric": "resolve_rate", "submission_time": "2025-11-24T19:56:00.092922", "tags": ["swe-bench"], "total_cost": 24.9, "total_runtime": 449.0}
|
| 6 |
-
{"agent_name": "OpenHands CodeAct v2.1", "llm_base": "claude-3-5-sonnet-20241022", "openness": "closed_api_available", "tool_usage": "standard", "score": 35.2, "metric": "resolve_rate", "submission_time": "2025-11-24T19:56:00.093026", "tags": ["multi-swe-bench"], "total_cost": 27.6, "total_runtime": 476.0}
|
| 7 |
-
{"agent_name": "OpenHands CodeAct v2.0", "llm_base": "gpt-4o-2024-11-20", "openness": "closed_api_available", "tool_usage": "standard", "score": 32.8, "metric": "resolve_rate", "submission_time": "2025-11-24T19:56:00.093040", "tags": ["multi-swe-bench"], "total_cost": 26.4, "total_runtime": 464.0}
|
| 8 |
-
{"agent_name": "AutoCodeRover", "llm_base": "gpt-4-turbo-2024-04-09", "openness": "closed_api_available", "tool_usage": "standard", "score": 28.4, "metric": "resolve_rate", "submission_time": "2025-11-24T19:56:00.093048", "tags": ["multi-swe-bench"], "total_cost": 24.2, "total_runtime": 442.0}
|
| 9 |
-
{"agent_name": "Agentless", "llm_base": "gpt-4o-mini-2024-07-18", "openness": "closed_api_available", "tool_usage": "standard", "score": 24.1, "metric": "resolve_rate", "submission_time": "2025-11-24T19:56:00.093058", "tags": ["multi-swe-bench"], "total_cost": 22.05, "total_runtime": 420.5}
|
| 10 |
-
{"agent_name": "SWE-Agent", "llm_base": "claude-3-opus-20240229", "openness": "closed_api_available", "tool_usage": "custom_interface", "score": 21.5, "metric": "resolve_rate", "submission_time": "2025-11-24T19:56:00.093067", "tags": ["multi-swe-bench"], "total_cost": 20.75, "total_runtime": 407.5}
|
| 11 |
-
{"agent_name": "OpenHands CodeAct v2.1", "llm_base": "claude-3-5-sonnet-20241022", "openness": "closed_api_available", "tool_usage": "standard", "score": 42.1, "metric": "resolve_rate", "submission_time": "2025-11-24T19:56:00.093172", "tags": ["swe-bench-multimodal"], "total_cost": 31.05, "total_runtime": 510.5}
|
| 12 |
-
{"agent_name": "OpenHands CodeAct v2.0", "llm_base": "gpt-4o-2024-11-20", "openness": "closed_api_available", "tool_usage": "standard", "score": 39.5, "metric": "resolve_rate", "submission_time": "2025-11-24T19:56:00.093187", "tags": ["swe-bench-multimodal"], "total_cost": 29.75, "total_runtime": 497.5}
|
| 13 |
-
{"agent_name": "AutoCodeRover", "llm_base": "gpt-4-turbo-2024-04-09", "openness": "closed_api_available", "tool_usage": "standard", "score": 34.2, "metric": "resolve_rate", "submission_time": "2025-11-24T19:56:00.093200", "tags": ["swe-bench-multimodal"], "total_cost": 27.1, "total_runtime": 471.0}
|
| 14 |
-
{"agent_name": "Agentless", "llm_base": "gpt-4o-mini-2024-07-18", "openness": "closed_api_available", "tool_usage": "standard", "score": 28.9, "metric": "resolve_rate", "submission_time": "2025-11-24T19:56:00.093208", "tags": ["swe-bench-multimodal"], "total_cost": 24.45, "total_runtime": 444.5}
|
| 15 |
-
{"agent_name": "SWE-Agent", "llm_base": "claude-3-opus-20240229", "openness": "closed_api_available", "tool_usage": "custom_interface", "score": 25.7, "metric": "resolve_rate", "submission_time": "2025-11-24T19:56:00.093218", "tags": ["swe-bench-multimodal"], "total_cost": 22.85, "total_runtime": 428.5}
|
| 16 |
-
{"agent_name": "OpenHands CodeAct v2.1", "llm_base": "claude-3-5-sonnet-20241022", "openness": "closed_api_available", "tool_usage": "standard", "score": 65.4, "metric": "success_rate", "submission_time": "2025-11-24T19:56:00.093304", "tags": ["swt-bench"], "total_cost": 42.7, "total_runtime": 627.0}
|
| 17 |
-
{"agent_name": "OpenHands CodeAct v2.0", "llm_base": "gpt-4o-2024-11-20", "openness": "closed_api_available", "tool_usage": "standard", "score": 62.3, "metric": "success_rate", "submission_time": "2025-11-24T19:56:00.093317", "tags": ["swt-bench"], "total_cost": 41.15, "total_runtime": 611.5}
|
| 18 |
-
{"agent_name": "AutoCodeRover", "llm_base": "gpt-4-turbo-2024-04-09", "openness": "closed_api_available", "tool_usage": "standard", "score": 54.1, "metric": "success_rate", "submission_time": "2025-11-24T19:56:00.093326", "tags": ["swt-bench"], "total_cost": 37.05, "total_runtime": 570.5}
|
| 19 |
-
{"agent_name": "Agentless", "llm_base": "gpt-4o-mini-2024-07-18", "openness": "closed_api_available", "tool_usage": "standard", "score": 47.8, "metric": "success_rate", "submission_time": "2025-11-24T19:56:00.093333", "tags": ["swt-bench"], "total_cost": 33.9, "total_runtime": 539.0}
|
| 20 |
-
{"agent_name": "SWE-Agent", "llm_base": "claude-3-opus-20240229", "openness": "closed_api_available", "tool_usage": "custom_interface", "score": 44.2, "metric": "success_rate", "submission_time": "2025-11-24T19:56:00.093338", "tags": ["swt-bench"], "total_cost": 32.1, "total_runtime": 521.0}
|
| 21 |
-
{"agent_name": "OpenHands CodeAct v2.1", "llm_base": "claude-3-5-sonnet-20241022", "openness": "closed_api_available", "tool_usage": "standard", "score": 71.2, "metric": "test_pass_rate", "submission_time": "2025-11-24T19:56:00.093416", "tags": ["commit0"], "total_cost": 45.6, "total_runtime": 656.0}
|
| 22 |
-
{"agent_name": "OpenHands CodeAct v2.0", "llm_base": "gpt-4o-2024-11-20", "openness": "closed_api_available", "tool_usage": "standard", "score": 68.9, "metric": "test_pass_rate", "submission_time": "2025-11-24T19:56:00.093428", "tags": ["commit0"], "total_cost": 44.45, "total_runtime": 644.5}
|
| 23 |
-
{"agent_name": "AutoCodeRover", "llm_base": "gpt-4-turbo-2024-04-09", "openness": "closed_api_available", "tool_usage": "standard", "score": 61.5, "metric": "test_pass_rate", "submission_time": "2025-11-24T19:56:00.093436", "tags": ["commit0"], "total_cost": 40.75, "total_runtime": 607.5}
|
| 24 |
-
{"agent_name": "Agentless", "llm_base": "gpt-4o-mini-2024-07-18", "openness": "closed_api_available", "tool_usage": "standard", "score": 55.3, "metric": "test_pass_rate", "submission_time": "2025-11-24T19:56:00.093445", "tags": ["commit0"], "total_cost": 37.65, "total_runtime": 576.5}
|
| 25 |
-
{"agent_name": "SWE-Agent", "llm_base": "claude-3-opus-20240229", "openness": "closed_api_available", "tool_usage": "custom_interface", "score": 52.1, "metric": "test_pass_rate", "submission_time": "2025-11-24T19:56:00.093450", "tags": ["commit0"], "total_cost": 36.05, "total_runtime": 560.5}
|
| 26 |
-
{"agent_name": "OpenHands CodeAct v2.1", "llm_base": "claude-3-5-sonnet-20241022", "openness": "closed_api_available", "tool_usage": "standard", "score": 58.7, "metric": "accuracy", "submission_time": "2025-11-24T19:56:00.093541", "tags": ["gaia"], "total_cost": 39.35, "total_runtime": 593.5}
|
| 27 |
-
{"agent_name": "OpenHands CodeAct v2.0", "llm_base": "gpt-4o-2024-11-20", "openness": "closed_api_available", "tool_usage": "standard", "score": 55.2, "metric": "accuracy", "submission_time": "2025-11-24T19:56:00.093556", "tags": ["gaia"], "total_cost": 37.6, "total_runtime": 576.0}
|
| 28 |
-
{"agent_name": "AutoCodeRover", "llm_base": "gpt-4-turbo-2024-04-09", "openness": "closed_api_available", "tool_usage": "standard", "score": 48.3, "metric": "accuracy", "submission_time": "2025-11-24T19:56:00.093563", "tags": ["gaia"], "total_cost": 34.15, "total_runtime": 541.5}
|
| 29 |
-
{"agent_name": "Agentless", "llm_base": "gpt-4o-mini-2024-07-18", "openness": "closed_api_available", "tool_usage": "standard", "score": 42.1, "metric": "accuracy", "submission_time": "2025-11-24T19:56:00.093568", "tags": ["gaia"], "total_cost": 31.05, "total_runtime": 510.5}
|
| 30 |
-
{"agent_name": "SWE-Agent", "llm_base": "claude-3-opus-20240229", "openness": "closed_api_available", "tool_usage": "custom_interface", "score": 39.4, "metric": "accuracy", "submission_time": "2025-11-24T19:56:00.093574", "tags": ["gaia"], "total_cost": 29.7, "total_runtime": 497.0}
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
mock_results/1.0.0-dev1/test.parquet
DELETED
|
Binary file (9.26 kB)
|
|
|
mock_results/1.0.0-dev1/validation.jsonl
DELETED
|
@@ -1,30 +0,0 @@
|
|
| 1 |
-
{"agent_name": "OpenHands CodeAct v2.1", "llm_base": "claude-3-5-sonnet-20241022", "openness": "closed_api_available", "tool_usage": "standard", "score": 48.3, "metric": "resolve_rate", "submission_time": "2025-11-24T19:56:00.092865", "tags": ["swe-bench"], "total_cost": 34.15, "total_runtime": 541.5}
|
| 2 |
-
{"agent_name": "OpenHands CodeAct v2.0", "llm_base": "gpt-4o-2024-11-20", "openness": "closed_api_available", "tool_usage": "standard", "score": 45.1, "metric": "resolve_rate", "submission_time": "2025-11-24T19:56:00.092895", "tags": ["swe-bench"], "total_cost": 32.55, "total_runtime": 525.5}
|
| 3 |
-
{"agent_name": "AutoCodeRover", "llm_base": "gpt-4-turbo-2024-04-09", "openness": "closed_api_available", "tool_usage": "standard", "score": 38.7, "metric": "resolve_rate", "submission_time": "2025-11-24T19:56:00.092908", "tags": ["swe-bench"], "total_cost": 29.35, "total_runtime": 493.5}
|
| 4 |
-
{"agent_name": "Agentless", "llm_base": "gpt-4o-mini-2024-07-18", "openness": "closed_api_available", "tool_usage": "standard", "score": 32.5, "metric": "resolve_rate", "submission_time": "2025-11-24T19:56:00.092916", "tags": ["swe-bench"], "total_cost": 26.25, "total_runtime": 462.5}
|
| 5 |
-
{"agent_name": "SWE-Agent", "llm_base": "claude-3-opus-20240229", "openness": "closed_api_available", "tool_usage": "custom_interface", "score": 29.8, "metric": "resolve_rate", "submission_time": "2025-11-24T19:56:00.092922", "tags": ["swe-bench"], "total_cost": 24.9, "total_runtime": 449.0}
|
| 6 |
-
{"agent_name": "OpenHands CodeAct v2.1", "llm_base": "claude-3-5-sonnet-20241022", "openness": "closed_api_available", "tool_usage": "standard", "score": 35.2, "metric": "resolve_rate", "submission_time": "2025-11-24T19:56:00.093026", "tags": ["multi-swe-bench"], "total_cost": 27.6, "total_runtime": 476.0}
|
| 7 |
-
{"agent_name": "OpenHands CodeAct v2.0", "llm_base": "gpt-4o-2024-11-20", "openness": "closed_api_available", "tool_usage": "standard", "score": 32.8, "metric": "resolve_rate", "submission_time": "2025-11-24T19:56:00.093040", "tags": ["multi-swe-bench"], "total_cost": 26.4, "total_runtime": 464.0}
|
| 8 |
-
{"agent_name": "AutoCodeRover", "llm_base": "gpt-4-turbo-2024-04-09", "openness": "closed_api_available", "tool_usage": "standard", "score": 28.4, "metric": "resolve_rate", "submission_time": "2025-11-24T19:56:00.093048", "tags": ["multi-swe-bench"], "total_cost": 24.2, "total_runtime": 442.0}
|
| 9 |
-
{"agent_name": "Agentless", "llm_base": "gpt-4o-mini-2024-07-18", "openness": "closed_api_available", "tool_usage": "standard", "score": 24.1, "metric": "resolve_rate", "submission_time": "2025-11-24T19:56:00.093058", "tags": ["multi-swe-bench"], "total_cost": 22.05, "total_runtime": 420.5}
|
| 10 |
-
{"agent_name": "SWE-Agent", "llm_base": "claude-3-opus-20240229", "openness": "closed_api_available", "tool_usage": "custom_interface", "score": 21.5, "metric": "resolve_rate", "submission_time": "2025-11-24T19:56:00.093067", "tags": ["multi-swe-bench"], "total_cost": 20.75, "total_runtime": 407.5}
|
| 11 |
-
{"agent_name": "OpenHands CodeAct v2.1", "llm_base": "claude-3-5-sonnet-20241022", "openness": "closed_api_available", "tool_usage": "standard", "score": 42.1, "metric": "resolve_rate", "submission_time": "2025-11-24T19:56:00.093172", "tags": ["swe-bench-multimodal"], "total_cost": 31.05, "total_runtime": 510.5}
|
| 12 |
-
{"agent_name": "OpenHands CodeAct v2.0", "llm_base": "gpt-4o-2024-11-20", "openness": "closed_api_available", "tool_usage": "standard", "score": 39.5, "metric": "resolve_rate", "submission_time": "2025-11-24T19:56:00.093187", "tags": ["swe-bench-multimodal"], "total_cost": 29.75, "total_runtime": 497.5}
|
| 13 |
-
{"agent_name": "AutoCodeRover", "llm_base": "gpt-4-turbo-2024-04-09", "openness": "closed_api_available", "tool_usage": "standard", "score": 34.2, "metric": "resolve_rate", "submission_time": "2025-11-24T19:56:00.093200", "tags": ["swe-bench-multimodal"], "total_cost": 27.1, "total_runtime": 471.0}
|
| 14 |
-
{"agent_name": "Agentless", "llm_base": "gpt-4o-mini-2024-07-18", "openness": "closed_api_available", "tool_usage": "standard", "score": 28.9, "metric": "resolve_rate", "submission_time": "2025-11-24T19:56:00.093208", "tags": ["swe-bench-multimodal"], "total_cost": 24.45, "total_runtime": 444.5}
|
| 15 |
-
{"agent_name": "SWE-Agent", "llm_base": "claude-3-opus-20240229", "openness": "closed_api_available", "tool_usage": "custom_interface", "score": 25.7, "metric": "resolve_rate", "submission_time": "2025-11-24T19:56:00.093218", "tags": ["swe-bench-multimodal"], "total_cost": 22.85, "total_runtime": 428.5}
|
| 16 |
-
{"agent_name": "OpenHands CodeAct v2.1", "llm_base": "claude-3-5-sonnet-20241022", "openness": "closed_api_available", "tool_usage": "standard", "score": 65.4, "metric": "success_rate", "submission_time": "2025-11-24T19:56:00.093304", "tags": ["swt-bench"], "total_cost": 42.7, "total_runtime": 627.0}
|
| 17 |
-
{"agent_name": "OpenHands CodeAct v2.0", "llm_base": "gpt-4o-2024-11-20", "openness": "closed_api_available", "tool_usage": "standard", "score": 62.3, "metric": "success_rate", "submission_time": "2025-11-24T19:56:00.093317", "tags": ["swt-bench"], "total_cost": 41.15, "total_runtime": 611.5}
|
| 18 |
-
{"agent_name": "AutoCodeRover", "llm_base": "gpt-4-turbo-2024-04-09", "openness": "closed_api_available", "tool_usage": "standard", "score": 54.1, "metric": "success_rate", "submission_time": "2025-11-24T19:56:00.093326", "tags": ["swt-bench"], "total_cost": 37.05, "total_runtime": 570.5}
|
| 19 |
-
{"agent_name": "Agentless", "llm_base": "gpt-4o-mini-2024-07-18", "openness": "closed_api_available", "tool_usage": "standard", "score": 47.8, "metric": "success_rate", "submission_time": "2025-11-24T19:56:00.093333", "tags": ["swt-bench"], "total_cost": 33.9, "total_runtime": 539.0}
|
| 20 |
-
{"agent_name": "SWE-Agent", "llm_base": "claude-3-opus-20240229", "openness": "closed_api_available", "tool_usage": "custom_interface", "score": 44.2, "metric": "success_rate", "submission_time": "2025-11-24T19:56:00.093338", "tags": ["swt-bench"], "total_cost": 32.1, "total_runtime": 521.0}
|
| 21 |
-
{"agent_name": "OpenHands CodeAct v2.1", "llm_base": "claude-3-5-sonnet-20241022", "openness": "closed_api_available", "tool_usage": "standard", "score": 71.2, "metric": "test_pass_rate", "submission_time": "2025-11-24T19:56:00.093416", "tags": ["commit0"], "total_cost": 45.6, "total_runtime": 656.0}
|
| 22 |
-
{"agent_name": "OpenHands CodeAct v2.0", "llm_base": "gpt-4o-2024-11-20", "openness": "closed_api_available", "tool_usage": "standard", "score": 68.9, "metric": "test_pass_rate", "submission_time": "2025-11-24T19:56:00.093428", "tags": ["commit0"], "total_cost": 44.45, "total_runtime": 644.5}
|
| 23 |
-
{"agent_name": "AutoCodeRover", "llm_base": "gpt-4-turbo-2024-04-09", "openness": "closed_api_available", "tool_usage": "standard", "score": 61.5, "metric": "test_pass_rate", "submission_time": "2025-11-24T19:56:00.093436", "tags": ["commit0"], "total_cost": 40.75, "total_runtime": 607.5}
|
| 24 |
-
{"agent_name": "Agentless", "llm_base": "gpt-4o-mini-2024-07-18", "openness": "closed_api_available", "tool_usage": "standard", "score": 55.3, "metric": "test_pass_rate", "submission_time": "2025-11-24T19:56:00.093445", "tags": ["commit0"], "total_cost": 37.65, "total_runtime": 576.5}
|
| 25 |
-
{"agent_name": "SWE-Agent", "llm_base": "claude-3-opus-20240229", "openness": "closed_api_available", "tool_usage": "custom_interface", "score": 52.1, "metric": "test_pass_rate", "submission_time": "2025-11-24T19:56:00.093450", "tags": ["commit0"], "total_cost": 36.05, "total_runtime": 560.5}
|
| 26 |
-
{"agent_name": "OpenHands CodeAct v2.1", "llm_base": "claude-3-5-sonnet-20241022", "openness": "closed_api_available", "tool_usage": "standard", "score": 58.7, "metric": "accuracy", "submission_time": "2025-11-24T19:56:00.093541", "tags": ["gaia"], "total_cost": 39.35, "total_runtime": 593.5}
|
| 27 |
-
{"agent_name": "OpenHands CodeAct v2.0", "llm_base": "gpt-4o-2024-11-20", "openness": "closed_api_available", "tool_usage": "standard", "score": 55.2, "metric": "accuracy", "submission_time": "2025-11-24T19:56:00.093556", "tags": ["gaia"], "total_cost": 37.6, "total_runtime": 576.0}
|
| 28 |
-
{"agent_name": "AutoCodeRover", "llm_base": "gpt-4-turbo-2024-04-09", "openness": "closed_api_available", "tool_usage": "standard", "score": 48.3, "metric": "accuracy", "submission_time": "2025-11-24T19:56:00.093563", "tags": ["gaia"], "total_cost": 34.15, "total_runtime": 541.5}
|
| 29 |
-
{"agent_name": "Agentless", "llm_base": "gpt-4o-mini-2024-07-18", "openness": "closed_api_available", "tool_usage": "standard", "score": 42.1, "metric": "accuracy", "submission_time": "2025-11-24T19:56:00.093568", "tags": ["gaia"], "total_cost": 31.05, "total_runtime": 510.5}
|
| 30 |
-
{"agent_name": "SWE-Agent", "llm_base": "claude-3-opus-20240229", "openness": "closed_api_available", "tool_usage": "custom_interface", "score": 39.4, "metric": "accuracy", "submission_time": "2025-11-24T19:56:00.093574", "tags": ["gaia"], "total_cost": 29.7, "total_runtime": 497.0}
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
mock_results/1.0.0-dev1/validation.parquet
DELETED
|
Binary file (9.29 kB)
|
|
|
requirements.txt
CHANGED
|
@@ -4,7 +4,6 @@ pandas==2.2.3
|
|
| 4 |
plotly==6.0.1
|
| 5 |
requests==2.32.3
|
| 6 |
huggingface-hub==0.30.2
|
| 7 |
-
APScheduler==3.11.0
|
| 8 |
|
| 9 |
# Additional dependencies for UI and processing
|
| 10 |
matplotlib==3.10.3
|
|
|
|
| 4 |
plotly==6.0.1
|
| 5 |
requests==2.32.3
|
| 6 |
huggingface-hub==0.30.2
|
|
|
|
| 7 |
|
| 8 |
# Additional dependencies for UI and processing
|
| 9 |
matplotlib==3.10.3
|
setup_data.py
CHANGED
|
@@ -44,33 +44,29 @@ def fetch_data_from_github():
|
|
| 44 |
return False
|
| 45 |
|
| 46 |
# Look for data files in the cloned repository
|
| 47 |
-
# Expected structure: openhands-index-results/
|
| 48 |
-
|
| 49 |
|
| 50 |
-
if not
|
| 51 |
-
print(f"
|
| 52 |
-
|
| 53 |
-
version_dirs = list(temp_clone_dir.glob("*.*.*"))
|
| 54 |
-
if version_dirs:
|
| 55 |
-
print(f"Found version directories: {[d.name for d in version_dirs]}")
|
| 56 |
-
# Use the first available version
|
| 57 |
-
data_source = version_dirs[0]
|
| 58 |
-
print(f"Using data from {data_source.name}")
|
| 59 |
-
else:
|
| 60 |
-
print("No data found in repository")
|
| 61 |
-
return False
|
| 62 |
|
| 63 |
-
# Check if there are any
|
| 64 |
-
|
| 65 |
-
if not
|
| 66 |
-
print(f"No
|
| 67 |
return False
|
| 68 |
|
| 69 |
-
|
|
|
|
|
|
|
| 70 |
os.makedirs(target_dir.parent, exist_ok=True)
|
| 71 |
if target_dir.exists():
|
| 72 |
shutil.rmtree(target_dir)
|
| 73 |
-
|
|
|
|
|
|
|
|
|
|
| 74 |
|
| 75 |
print(f"Successfully fetched data from GitHub. Files: {list(target_dir.glob('*'))}")
|
| 76 |
return True
|
|
|
|
| 44 |
return False
|
| 45 |
|
| 46 |
# Look for data files in the cloned repository
|
| 47 |
+
# Expected structure: openhands-index-results/results/YYYYMMDD_model/
|
| 48 |
+
results_source = temp_clone_dir / "results"
|
| 49 |
|
| 50 |
+
if not results_source.exists():
|
| 51 |
+
print(f"Results directory not found in repository")
|
| 52 |
+
return False
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 53 |
|
| 54 |
+
# Check if there are any agent result directories
|
| 55 |
+
result_dirs = list(results_source.iterdir())
|
| 56 |
+
if not result_dirs:
|
| 57 |
+
print(f"No agent results found in {results_source}")
|
| 58 |
return False
|
| 59 |
|
| 60 |
+
print(f"Found {len(result_dirs)} agent result directories")
|
| 61 |
+
|
| 62 |
+
# Create target directory and copy the results structure
|
| 63 |
os.makedirs(target_dir.parent, exist_ok=True)
|
| 64 |
if target_dir.exists():
|
| 65 |
shutil.rmtree(target_dir)
|
| 66 |
+
|
| 67 |
+
# Copy the entire results directory
|
| 68 |
+
target_results = target_dir / "results"
|
| 69 |
+
shutil.copytree(results_source, target_results)
|
| 70 |
|
| 71 |
print(f"Successfully fetched data from GitHub. Files: {list(target_dir.glob('*'))}")
|
| 72 |
return True
|
simple_data_loader.py
CHANGED
|
@@ -103,43 +103,16 @@ class SimpleLeaderboardViewer:
|
|
| 103 |
return pd.DataFrame(all_records)
|
| 104 |
|
| 105 |
def _load(self):
|
| 106 |
-
"""Load
|
| 107 |
-
# Try new format first (agent-centric directories)
|
| 108 |
df = self._load_from_agent_dirs()
|
| 109 |
|
| 110 |
if df is None:
|
| 111 |
-
#
|
| 112 |
-
|
| 113 |
-
|
| 114 |
-
|
| 115 |
-
# Return empty dataframe with error message
|
| 116 |
-
return pd.DataFrame({
|
| 117 |
-
"Message": [f"No data found for split '{self.split}'. Expected file: {jsonl_file}"]
|
| 118 |
-
}), {}
|
| 119 |
-
|
| 120 |
-
try:
|
| 121 |
-
# Read JSONL file
|
| 122 |
-
records = []
|
| 123 |
-
with open(jsonl_file, 'r') as f:
|
| 124 |
-
for line in f:
|
| 125 |
-
if line.strip():
|
| 126 |
-
records.append(json.loads(line))
|
| 127 |
-
|
| 128 |
-
if not records:
|
| 129 |
-
return pd.DataFrame({
|
| 130 |
-
"Message": [f"No data in file: {jsonl_file}"]
|
| 131 |
-
}), {}
|
| 132 |
-
|
| 133 |
-
# Convert to DataFrame
|
| 134 |
-
df = pd.DataFrame(records)
|
| 135 |
-
except Exception as e:
|
| 136 |
-
import traceback
|
| 137 |
-
traceback.print_exc()
|
| 138 |
-
return pd.DataFrame({
|
| 139 |
-
"Message": [f"Error loading data: {e}"]
|
| 140 |
-
}), {}
|
| 141 |
|
| 142 |
-
#
|
| 143 |
try:
|
| 144 |
|
| 145 |
# Transform to expected format for leaderboard
|
|
|
|
| 103 |
return pd.DataFrame(all_records)
|
| 104 |
|
| 105 |
def _load(self):
|
| 106 |
+
"""Load data from agent-centric directories and return DataFrame and tag map."""
|
|
|
|
| 107 |
df = self._load_from_agent_dirs()
|
| 108 |
|
| 109 |
if df is None:
|
| 110 |
+
# Return empty dataframe with error message
|
| 111 |
+
return pd.DataFrame({
|
| 112 |
+
"Message": [f"No data found for split '{self.split}' in results directory"]
|
| 113 |
+
}), {}
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 114 |
|
| 115 |
+
# Process the dataframe
|
| 116 |
try:
|
| 117 |
|
| 118 |
# Transform to expected format for leaderboard
|