openhands openhands committed on
Commit
855423e
·
1 Parent(s): 64c8899

Clean up codebase: remove unused code, simplify data loading, and add pre-release notice

Browse files

- Remove unused github_data_loader.py (71 lines of dead code)
- Clean up debug logging in app.py (replace ~40 print statements with a proper logger)
- Remove scheduler code and APScheduler dependency from requirements.txt
- Simplify simple_data_loader.py by removing old JSONL format fallback
- Update setup_data.py to fetch data from results/ directory only
- Update mock data to use new agent-centric directory structure
- Add pre-release notice to main page intro paragraph

Co-authored-by: openhands <[email protected]>

Files changed (27) hide show
  1. app.py +14 -68
  2. content.py +4 -0
  3. github_data_loader.py +0 -71
  4. mock_results/1.0.0-dev1/agenteval.json +0 -74
  5. mock_results/1.0.0-dev1/commit0.jsonl +0 -5
  6. mock_results/1.0.0-dev1/gaia.jsonl +0 -5
  7. mock_results/1.0.0-dev1/multi-swe-bench.jsonl +0 -5
  8. mock_results/1.0.0-dev1/results/20251124_claude_3_5_sonnet_20241022/metadata.json +9 -0
  9. mock_results/1.0.0-dev1/results/20251124_claude_3_5_sonnet_20241022/scores.json +62 -0
  10. mock_results/1.0.0-dev1/results/20251124_claude_3_opus_20240229/metadata.json +9 -0
  11. mock_results/1.0.0-dev1/results/20251124_claude_3_opus_20240229/scores.json +62 -0
  12. mock_results/1.0.0-dev1/results/20251124_gpt_4_turbo_2024_04_09/metadata.json +9 -0
  13. mock_results/1.0.0-dev1/results/20251124_gpt_4_turbo_2024_04_09/scores.json +62 -0
  14. mock_results/1.0.0-dev1/results/20251124_gpt_4o_2024_11_20/metadata.json +9 -0
  15. mock_results/1.0.0-dev1/results/20251124_gpt_4o_2024_11_20/scores.json +62 -0
  16. mock_results/1.0.0-dev1/results/20251124_gpt_4o_mini_2024_07_18/metadata.json +9 -0
  17. mock_results/1.0.0-dev1/results/20251124_gpt_4o_mini_2024_07_18/scores.json +62 -0
  18. mock_results/1.0.0-dev1/swe-bench-multimodal.jsonl +0 -5
  19. mock_results/1.0.0-dev1/swe-bench.jsonl +0 -5
  20. mock_results/1.0.0-dev1/swt-bench.jsonl +0 -5
  21. mock_results/1.0.0-dev1/test.jsonl +0 -30
  22. mock_results/1.0.0-dev1/test.parquet +0 -0
  23. mock_results/1.0.0-dev1/validation.jsonl +0 -30
  24. mock_results/1.0.0-dev1/validation.parquet +0 -0
  25. requirements.txt +0 -1
  26. setup_data.py +16 -20
  27. simple_data_loader.py +6 -33
app.py CHANGED
@@ -2,56 +2,34 @@
2
  import logging
3
  import sys
4
 
5
- logging.basicConfig(level=logging.INFO) # Changed to INFO for better debugging
 
6
 
7
- print("=" * 80, file=sys.stderr)
8
- print("STARTING APP.PY", file=sys.stderr)
9
- print("=" * 80, file=sys.stderr)
10
 
11
  # Setup mock data before anything else
12
  try:
13
- print("Importing setup_data module...", file=sys.stderr)
14
  from setup_data import setup_mock_data
15
- print("Calling setup_mock_data()...", file=sys.stderr)
16
  setup_mock_data()
17
- print("Data setup completed successfully", file=sys.stderr)
18
  except Exception as e:
19
- print(f"!!! ERROR during data setup: {e}", file=sys.stderr)
20
- import traceback
21
- traceback.print_exc()
22
- print("Continuing with app startup despite error...", file=sys.stderr)
23
 
24
- print("Importing gradio...", file=sys.stderr)
25
  import gradio as gr
26
- print("✓ Gradio imported", file=sys.stderr)
27
  import urllib.parse
28
-
29
- print("Importing dependencies...", file=sys.stderr)
30
- from apscheduler.schedulers.background import BackgroundScheduler
31
  from huggingface_hub import HfApi
32
- print("✓ Dependencies imported", file=sys.stderr)
33
-
34
- print("Importing config...", file=sys.stderr)
35
  from config import LEADERBOARD_PATH, LOCAL_DEBUG
36
- print(f"✓ Config imported (LOCAL_DEBUG={LOCAL_DEBUG})", file=sys.stderr)
37
-
38
- print("Importing content and pages...", file=sys.stderr)
39
  from content import css
40
- print("✓ css imported", file=sys.stderr)
41
  from main_page import build_page as build_main_page
42
- print("✓ main_page imported", file=sys.stderr)
43
  from bug_fixing import build_page as build_bug_fixing_page
44
- print("✓ bug_fixing imported", file=sys.stderr)
45
  from app_creation import build_page as build_app_creation_page
46
- print("✓ app_creation imported", file=sys.stderr)
47
  from frontend_development import build_page as build_frontend_page
48
- print("✓ frontend_development imported", file=sys.stderr)
49
  from test_generation import build_page as build_test_generation_page
50
- print("✓ test_generation imported", file=sys.stderr)
51
  from information_gathering import build_page as build_information_gathering_page
52
- print("✓ information_gathering imported", file=sys.stderr)
53
  from about import build_page as build_about_page
54
- print("✓ All pages imported", file=sys.stderr)
 
55
 
56
  api = HfApi()
57
  LOGO_PATH = "assets/logo.svg"
@@ -193,7 +171,7 @@ try:
193
  encoded_svg = urllib.parse.quote(svg_content)
194
  home_icon_data_uri = f"data:image/svg+xml,{encoded_svg}"
195
  except FileNotFoundError:
196
- print(f"Warning: Home icon file not found at {LOGO_PATH}.")
197
  home_icon_data_uri = "none"
198
 
199
  # --- This is the final CSS ---
@@ -228,77 +206,45 @@ final_css = css + f"""
228
  }}
229
  """
230
  # --- Gradio App Definition ---
231
- print("Creating Gradio Blocks...", file=sys.stderr)
232
  demo = gr.Blocks(
233
  theme=theme,
234
  css=final_css,
235
  head=scroll_script + redirect_script + tooltip_script,
236
  title="OpenHands Index",
237
  )
238
- print("✓ Gradio Blocks created", file=sys.stderr)
239
 
240
- print("Building Home page route...", file=sys.stderr)
241
  with demo.route("Home", "/home"):
242
  build_main_page()
243
- print("✓ Home page built", file=sys.stderr)
244
 
245
- print("Building Bug Fixing page route...", file=sys.stderr)
246
  with demo.route("Bug Fixing", "/bug-fixing"):
247
  build_bug_fixing_page()
248
- print("✓ Bug Fixing page built", file=sys.stderr)
249
 
250
- print("Building App Creation page route...", file=sys.stderr)
251
  with demo.route("App Creation", "/app-creation"):
252
  build_app_creation_page()
253
- print("✓ App Creation page built", file=sys.stderr)
254
 
255
- print("Building Frontend Development page route...", file=sys.stderr)
256
  with demo.route("Frontend Development", "/frontend-development"):
257
  build_frontend_page()
258
- print("✓ Frontend Development page built", file=sys.stderr)
259
 
260
- print("Building Test Generation page route...", file=sys.stderr)
261
  with demo.route("Test Generation", "/test-generation"):
262
  build_test_generation_page()
263
- print("✓ Test Generation page built", file=sys.stderr)
264
 
265
- print("Building Information Gathering page route...", file=sys.stderr)
266
  with demo.route("Information Gathering", "/information-gathering"):
267
  build_information_gathering_page()
268
- print("✓ Information Gathering page built", file=sys.stderr)
269
 
270
- print("Building About page route...", file=sys.stderr)
271
  with demo.route("About", "/about"):
272
  build_about_page()
273
- print("✓ About page built", file=sys.stderr)
274
-
275
- # --- Scheduler and Launch
276
- def restart_space_job():
277
- print("Scheduler: Attempting to restart space.")
278
- try:
279
- api.restart_space(repo_id=LEADERBOARD_PATH)
280
- print("Scheduler: Space restart request sent.")
281
- except Exception as e:
282
- print(f"Scheduler: Error restarting space: {e}")
283
 
284
- # Disabled scheduler for now
285
- # scheduler = BackgroundScheduler(timezone="UTC")
286
- # scheduler.add_job(restart_space_job, "interval", hours=1)
287
- # scheduler.start()
288
 
289
 
290
  # Launch the Gradio app
291
  if __name__ == "__main__":
292
- print("=" * 80, file=sys.stderr)
293
- print("READY TO LAUNCH GRADIO APP", file=sys.stderr)
294
- print("=" * 80, file=sys.stderr)
295
  if LOCAL_DEBUG:
296
- print("Launching in LOCAL_DEBUG mode...", file=sys.stderr)
297
  demo.launch(debug=True, allowed_paths=["assets"], favicon_path="assets/favicon/favicon.ico")
298
  else:
299
- print("Launching in Space mode...", file=sys.stderr)
300
- # For Spaces, share=False is typical unless specific tunneling is needed.
301
- # debug=True can be set to False for a "production" Space.
302
  demo.launch(server_name="0.0.0.0", server_port=7860, debug=True, share=False, allowed_paths=["assets"], favicon_path="assets/favicon/favicon.ico")
303
- print("Gradio app launched successfully!", file=sys.stderr)
304
 
 
2
  import logging
3
  import sys
4
 
5
+ logging.basicConfig(level=logging.INFO, format='%(levelname)s: %(message)s')
6
+ logger = logging.getLogger(__name__)
7
 
8
+ logger.info("Starting OpenHands Index application")
 
 
9
 
10
  # Setup mock data before anything else
11
  try:
 
12
  from setup_data import setup_mock_data
 
13
  setup_mock_data()
14
+ logger.info("Data setup completed successfully")
15
  except Exception as e:
16
+ logger.error(f"Error during data setup: {e}", exc_info=True)
17
+ logger.warning("Continuing with app startup despite error")
 
 
18
 
 
19
  import gradio as gr
 
20
  import urllib.parse
 
 
 
21
  from huggingface_hub import HfApi
 
 
 
22
  from config import LEADERBOARD_PATH, LOCAL_DEBUG
 
 
 
23
  from content import css
 
24
  from main_page import build_page as build_main_page
 
25
  from bug_fixing import build_page as build_bug_fixing_page
 
26
  from app_creation import build_page as build_app_creation_page
 
27
  from frontend_development import build_page as build_frontend_page
 
28
  from test_generation import build_page as build_test_generation_page
 
29
  from information_gathering import build_page as build_information_gathering_page
 
30
  from about import build_page as build_about_page
31
+
32
+ logger.info(f"All modules imported (LOCAL_DEBUG={LOCAL_DEBUG})")
33
 
34
  api = HfApi()
35
  LOGO_PATH = "assets/logo.svg"
 
171
  encoded_svg = urllib.parse.quote(svg_content)
172
  home_icon_data_uri = f"data:image/svg+xml,{encoded_svg}"
173
  except FileNotFoundError:
174
+ logger.warning(f"Home icon file not found at {LOGO_PATH}")
175
  home_icon_data_uri = "none"
176
 
177
  # --- This is the final CSS ---
 
206
  }}
207
  """
208
  # --- Gradio App Definition ---
209
+ logger.info("Creating Gradio application")
210
  demo = gr.Blocks(
211
  theme=theme,
212
  css=final_css,
213
  head=scroll_script + redirect_script + tooltip_script,
214
  title="OpenHands Index",
215
  )
 
216
 
 
217
  with demo.route("Home", "/home"):
218
  build_main_page()
 
219
 
 
220
  with demo.route("Bug Fixing", "/bug-fixing"):
221
  build_bug_fixing_page()
 
222
 
 
223
  with demo.route("App Creation", "/app-creation"):
224
  build_app_creation_page()
 
225
 
 
226
  with demo.route("Frontend Development", "/frontend-development"):
227
  build_frontend_page()
 
228
 
 
229
  with demo.route("Test Generation", "/test-generation"):
230
  build_test_generation_page()
 
231
 
 
232
  with demo.route("Information Gathering", "/information-gathering"):
233
  build_information_gathering_page()
 
234
 
 
235
  with demo.route("About", "/about"):
236
  build_about_page()
 
 
 
 
 
 
 
 
 
 
237
 
238
+ logger.info("All routes configured")
 
 
 
239
 
240
 
241
  # Launch the Gradio app
242
  if __name__ == "__main__":
 
 
 
243
  if LOCAL_DEBUG:
244
+ logger.info("Launching in LOCAL_DEBUG mode")
245
  demo.launch(debug=True, allowed_paths=["assets"], favicon_path="assets/favicon/favicon.ico")
246
  else:
247
+ logger.info("Launching in Space mode")
 
 
248
  demo.launch(server_name="0.0.0.0", server_port=7860, debug=True, share=False, allowed_paths=["assets"], favicon_path="assets/favicon/favicon.ico")
249
+ logger.info("Gradio app launched successfully")
250
 
content.py CHANGED
@@ -16,6 +16,10 @@ def create_gradio_anchor_id(text: str, validation) -> str:
16
  TITLE = """<h1 align="left" id="space-title">OpenHands Index</h1>"""
17
 
18
  INTRO_PARAGRAPH = """
 
 
 
 
19
  <p>
20
  <strong>OpenHands Index</strong> provides an aggregated view of agent performance and efficiency across all benchmarks in all categories. We report:
21
  </p>
 
16
  TITLE = """<h1 align="left" id="space-title">OpenHands Index</h1>"""
17
 
18
  INTRO_PARAGRAPH = """
19
+ <p>
20
+ <b>Pre-release:</b> this codebase is not yet released.
21
+ </p>
22
+
23
  <p>
24
  <strong>OpenHands Index</strong> provides an aggregated view of agent performance and efficiency across all benchmarks in all categories. We report:
25
  </p>
github_data_loader.py DELETED
@@ -1,71 +0,0 @@
1
- """
2
- Custom data loader for OpenHands Index that fetches from GitHub instead of HF datasets.
3
- Mimics the interface of LeaderboardViewer from agent-eval.
4
- """
5
- import pandas as pd
6
- import requests
7
- from typing import Dict, List, Tuple
8
-
9
-
10
- class GitHubDataLoader:
11
- """Loads leaderboard data from GitHub repository."""
12
-
13
- def __init__(self, base_url: str, split: str):
14
- self.base_url = base_url
15
- self.split = split
16
- self.tag_map = self._build_tag_map()
17
-
18
- def _build_tag_map(self) -> Dict[str, List[str]]:
19
- """Build tag map for the OpenHands datasets."""
20
- # Map datasets to their respective tags
21
- return {
22
- "swe-bench": ["swe-bench"],
23
- "multi-swe-bench": ["multi-swe-bench"],
24
- "swe-bench-multimodal": ["swe-bench-multimodal"],
25
- "swt-bench": ["swt-bench"],
26
- "commit0": ["commit0"],
27
- "gaia": ["gaia"],
28
- }
29
-
30
- def _load(self) -> Tuple[pd.DataFrame, Dict]:
31
- """Load and combine data from all GitHub JSON files."""
32
- all_results = []
33
-
34
- datasets = ["swe-bench", "multi-swe-bench", "swe-bench-multimodal",
35
- "swt-bench", "commit0", "gaia"]
36
-
37
- for dataset in datasets:
38
- url = f"{self.base_url}/{dataset}.json"
39
- try:
40
- response = requests.get(url, timeout=10)
41
- if response.status_code == 200:
42
- data = response.json()
43
- # Transform GitHub data to match agenteval format
44
- for entry in data:
45
- all_results.append({
46
- "agent_name": entry.get("agent_name", "Unknown"),
47
- "score": entry.get("score", 0.0),
48
- "dataset": dataset,
49
- "split": self.split,
50
- # Add other fields as needed
51
- })
52
- except Exception as e:
53
- print(f"Warning: Could not load data from {url}: {e}")
54
- continue
55
-
56
- if all_results:
57
- df = pd.DataFrame(all_results)
58
- return df, self.tag_map
59
- else:
60
- return pd.DataFrame(), self.tag_map
61
-
62
-
63
- class DummyViewer:
64
- """Fallback viewer when data loading fails."""
65
-
66
- def __init__(self, df: pd.DataFrame):
67
- self._df = df
68
- self.tag_map = {"Overall": []}
69
-
70
- def _load(self) -> Tuple[pd.DataFrame, Dict]:
71
- return self._df, self.tag_map
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
mock_results/1.0.0-dev1/agenteval.json DELETED
@@ -1,74 +0,0 @@
1
- {
2
- "suite_config": {
3
- "name": "openhands-index",
4
- "version": "1.0.0-dev1",
5
- "splits": [
6
- {
7
- "name": "swe-bench",
8
- "tasks": [
9
- {
10
- "name": "swe-bench",
11
- "tags": [
12
- "swe-bench"
13
- ]
14
- }
15
- ]
16
- },
17
- {
18
- "name": "multi-swe-bench",
19
- "tasks": [
20
- {
21
- "name": "multi-swe-bench",
22
- "tags": [
23
- "multi-swe-bench"
24
- ]
25
- }
26
- ]
27
- },
28
- {
29
- "name": "swe-bench-multimodal",
30
- "tasks": [
31
- {
32
- "name": "swe-bench-multimodal",
33
- "tags": [
34
- "swe-bench-multimodal"
35
- ]
36
- }
37
- ]
38
- },
39
- {
40
- "name": "swt-bench",
41
- "tasks": [
42
- {
43
- "name": "swt-bench",
44
- "tags": [
45
- "swt-bench"
46
- ]
47
- }
48
- ]
49
- },
50
- {
51
- "name": "commit0",
52
- "tasks": [
53
- {
54
- "name": "commit0",
55
- "tags": [
56
- "commit0"
57
- ]
58
- }
59
- ]
60
- },
61
- {
62
- "name": "gaia",
63
- "tasks": [
64
- {
65
- "name": "gaia",
66
- "tags": [
67
- "gaia"
68
- ]
69
- }
70
- ]
71
- }
72
- ]
73
- }
74
- }
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
mock_results/1.0.0-dev1/commit0.jsonl DELETED
@@ -1,5 +0,0 @@
1
- {"agent_name": "1.0.2", "llm_base": "claude-3-5-sonnet-20241022", "openness": "closed", "score": 71.2, "metric": "test_pass_rate", "submission_time": "2025-11-25T00:12:22.972910", "tags": ["commit0"], "total_cost": 45.6, "total_runtime": 656.0}
2
- {"agent_name": "1.0.1", "llm_base": "gpt-4o-2024-11-20", "openness": "closed", "score": 68.9, "metric": "test_pass_rate", "submission_time": "2025-11-25T00:12:22.972929", "tags": ["commit0"], "total_cost": 44.45, "total_runtime": 644.5}
3
- {"agent_name": "1.0.0", "llm_base": "gpt-4-turbo-2024-04-09", "openness": "closed", "score": 61.5, "metric": "test_pass_rate", "submission_time": "2025-11-25T00:12:22.972939", "tags": ["commit0"], "total_cost": 40.75, "total_runtime": 607.5}
4
- {"agent_name": "0.9.5", "llm_base": "gpt-4o-mini-2024-07-18", "openness": "closed", "score": 55.3, "metric": "test_pass_rate", "submission_time": "2025-11-25T00:12:22.972947", "tags": ["commit0"], "total_cost": 37.65, "total_runtime": 576.5}
5
- {"agent_name": "0.9.0", "llm_base": "claude-3-opus-20240229", "openness": "closed", "score": 52.1, "metric": "test_pass_rate", "submission_time": "2025-11-25T00:12:22.972954", "tags": ["commit0"], "total_cost": 36.05, "total_runtime": 560.5}
 
 
 
 
 
 
mock_results/1.0.0-dev1/gaia.jsonl DELETED
@@ -1,5 +0,0 @@
1
- {"agent_name": "1.0.2", "llm_base": "claude-3-5-sonnet-20241022", "openness": "closed", "score": 58.7, "metric": "accuracy", "submission_time": "2025-11-25T00:12:22.973093", "tags": ["gaia"], "total_cost": 39.35, "total_runtime": 593.5}
2
- {"agent_name": "1.0.1", "llm_base": "gpt-4o-2024-11-20", "openness": "closed", "score": 55.2, "metric": "accuracy", "submission_time": "2025-11-25T00:12:22.973111", "tags": ["gaia"], "total_cost": 37.6, "total_runtime": 576.0}
3
- {"agent_name": "1.0.0", "llm_base": "gpt-4-turbo-2024-04-09", "openness": "closed", "score": 48.3, "metric": "accuracy", "submission_time": "2025-11-25T00:12:22.973121", "tags": ["gaia"], "total_cost": 34.15, "total_runtime": 541.5}
4
- {"agent_name": "0.9.5", "llm_base": "gpt-4o-mini-2024-07-18", "openness": "closed", "score": 42.1, "metric": "accuracy", "submission_time": "2025-11-25T00:12:22.973129", "tags": ["gaia"], "total_cost": 31.05, "total_runtime": 510.5}
5
- {"agent_name": "0.9.0", "llm_base": "claude-3-opus-20240229", "openness": "closed", "score": 39.4, "metric": "accuracy", "submission_time": "2025-11-25T00:12:22.973137", "tags": ["gaia"], "total_cost": 29.7, "total_runtime": 497.0}
 
 
 
 
 
 
mock_results/1.0.0-dev1/multi-swe-bench.jsonl DELETED
@@ -1,5 +0,0 @@
1
- {"agent_name": "1.0.2", "llm_base": "claude-3-5-sonnet-20241022", "openness": "closed", "score": 35.2, "metric": "resolve_rate", "submission_time": "2025-11-25T00:12:22.972368", "tags": ["multi-swe-bench"], "total_cost": 27.6, "total_runtime": 476.0}
2
- {"agent_name": "1.0.1", "llm_base": "gpt-4o-2024-11-20", "openness": "closed", "score": 32.8, "metric": "resolve_rate", "submission_time": "2025-11-25T00:12:22.972389", "tags": ["multi-swe-bench"], "total_cost": 26.4, "total_runtime": 464.0}
3
- {"agent_name": "1.0.0", "llm_base": "gpt-4-turbo-2024-04-09", "openness": "closed", "score": 28.4, "metric": "resolve_rate", "submission_time": "2025-11-25T00:12:22.972400", "tags": ["multi-swe-bench"], "total_cost": 24.2, "total_runtime": 442.0}
4
- {"agent_name": "0.9.5", "llm_base": "gpt-4o-mini-2024-07-18", "openness": "closed", "score": 24.1, "metric": "resolve_rate", "submission_time": "2025-11-25T00:12:22.972408", "tags": ["multi-swe-bench"], "total_cost": 22.05, "total_runtime": 420.5}
5
- {"agent_name": "0.9.0", "llm_base": "claude-3-opus-20240229", "openness": "closed", "score": 21.5, "metric": "resolve_rate", "submission_time": "2025-11-25T00:12:22.972416", "tags": ["multi-swe-bench"], "total_cost": 20.75, "total_runtime": 407.5}
 
 
 
 
 
 
mock_results/1.0.0-dev1/results/20251124_claude_3_5_sonnet_20241022/metadata.json ADDED
@@ -0,0 +1,9 @@
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "agent_name": "OpenHands CodeAct v2.1",
3
+ "agent_version": "OpenHands CodeAct v2.1",
4
+ "model": "claude-3-5-sonnet-20241022",
5
+ "openness": "closed_api_available",
6
+ "tool_usage": "standard",
7
+ "submission_time": "2025-11-24T19:56:00.092865",
8
+ "directory_name": "20251124_claude_3_5_sonnet_20241022"
9
+ }
mock_results/1.0.0-dev1/results/20251124_claude_3_5_sonnet_20241022/scores.json ADDED
@@ -0,0 +1,62 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ [
2
+ {
3
+ "benchmark": "swe-bench",
4
+ "score": 48.3,
5
+ "metric": "resolve_rate",
6
+ "total_cost": 34.15,
7
+ "total_runtime": 541.5,
8
+ "tags": [
9
+ "swe-bench"
10
+ ]
11
+ },
12
+ {
13
+ "benchmark": "swe-bench-multimodal",
14
+ "score": 42.1,
15
+ "metric": "resolve_rate",
16
+ "total_cost": 31.05,
17
+ "total_runtime": 510.5,
18
+ "tags": [
19
+ "swe-bench-multimodal"
20
+ ]
21
+ },
22
+ {
23
+ "benchmark": "commit0",
24
+ "score": 71.2,
25
+ "metric": "test_pass_rate",
26
+ "total_cost": 45.6,
27
+ "total_runtime": 656.0,
28
+ "tags": [
29
+ "commit0"
30
+ ]
31
+ },
32
+ {
33
+ "benchmark": "multi-swe-bench",
34
+ "score": 35.2,
35
+ "metric": "resolve_rate",
36
+ "total_cost": 27.6,
37
+ "total_runtime": 476.0,
38
+ "tags": [
39
+ "multi-swe-bench"
40
+ ]
41
+ },
42
+ {
43
+ "benchmark": "swt-bench",
44
+ "score": 65.4,
45
+ "metric": "success_rate",
46
+ "total_cost": 42.7,
47
+ "total_runtime": 627.0,
48
+ "tags": [
49
+ "swt-bench"
50
+ ]
51
+ },
52
+ {
53
+ "benchmark": "gaia",
54
+ "score": 58.7,
55
+ "metric": "accuracy",
56
+ "total_cost": 39.35,
57
+ "total_runtime": 593.5,
58
+ "tags": [
59
+ "gaia"
60
+ ]
61
+ }
62
+ ]
mock_results/1.0.0-dev1/results/20251124_claude_3_opus_20240229/metadata.json ADDED
@@ -0,0 +1,9 @@
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "agent_name": "SWE-Agent",
3
+ "agent_version": "SWE-Agent",
4
+ "model": "claude-3-opus-20240229",
5
+ "openness": "closed_api_available",
6
+ "tool_usage": "custom_interface",
7
+ "submission_time": "2025-11-24T19:56:00.092922",
8
+ "directory_name": "20251124_claude_3_opus_20240229"
9
+ }
mock_results/1.0.0-dev1/results/20251124_claude_3_opus_20240229/scores.json ADDED
@@ -0,0 +1,62 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ [
2
+ {
3
+ "benchmark": "swe-bench",
4
+ "score": 29.8,
5
+ "metric": "resolve_rate",
6
+ "total_cost": 24.9,
7
+ "total_runtime": 449.0,
8
+ "tags": [
9
+ "swe-bench"
10
+ ]
11
+ },
12
+ {
13
+ "benchmark": "swe-bench-multimodal",
14
+ "score": 25.7,
15
+ "metric": "resolve_rate",
16
+ "total_cost": 22.85,
17
+ "total_runtime": 428.5,
18
+ "tags": [
19
+ "swe-bench-multimodal"
20
+ ]
21
+ },
22
+ {
23
+ "benchmark": "commit0",
24
+ "score": 52.1,
25
+ "metric": "test_pass_rate",
26
+ "total_cost": 36.05,
27
+ "total_runtime": 560.5,
28
+ "tags": [
29
+ "commit0"
30
+ ]
31
+ },
32
+ {
33
+ "benchmark": "multi-swe-bench",
34
+ "score": 21.5,
35
+ "metric": "resolve_rate",
36
+ "total_cost": 20.75,
37
+ "total_runtime": 407.5,
38
+ "tags": [
39
+ "multi-swe-bench"
40
+ ]
41
+ },
42
+ {
43
+ "benchmark": "swt-bench",
44
+ "score": 44.2,
45
+ "metric": "success_rate",
46
+ "total_cost": 32.1,
47
+ "total_runtime": 521.0,
48
+ "tags": [
49
+ "swt-bench"
50
+ ]
51
+ },
52
+ {
53
+ "benchmark": "gaia",
54
+ "score": 39.4,
55
+ "metric": "accuracy",
56
+ "total_cost": 29.7,
57
+ "total_runtime": 497.0,
58
+ "tags": [
59
+ "gaia"
60
+ ]
61
+ }
62
+ ]
mock_results/1.0.0-dev1/results/20251124_gpt_4_turbo_2024_04_09/metadata.json ADDED
@@ -0,0 +1,9 @@
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "agent_name": "AutoCodeRover",
3
+ "agent_version": "AutoCodeRover",
4
+ "model": "gpt-4-turbo-2024-04-09",
5
+ "openness": "closed_api_available",
6
+ "tool_usage": "standard",
7
+ "submission_time": "2025-11-24T19:56:00.092908",
8
+ "directory_name": "20251124_gpt_4_turbo_2024_04_09"
9
+ }
mock_results/1.0.0-dev1/results/20251124_gpt_4_turbo_2024_04_09/scores.json ADDED
@@ -0,0 +1,62 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ [
2
+ {
3
+ "benchmark": "swe-bench",
4
+ "score": 38.7,
5
+ "metric": "resolve_rate",
6
+ "total_cost": 29.35,
7
+ "total_runtime": 493.5,
8
+ "tags": [
9
+ "swe-bench"
10
+ ]
11
+ },
12
+ {
13
+ "benchmark": "swe-bench-multimodal",
14
+ "score": 34.2,
15
+ "metric": "resolve_rate",
16
+ "total_cost": 27.1,
17
+ "total_runtime": 471.0,
18
+ "tags": [
19
+ "swe-bench-multimodal"
20
+ ]
21
+ },
22
+ {
23
+ "benchmark": "commit0",
24
+ "score": 61.5,
25
+ "metric": "test_pass_rate",
26
+ "total_cost": 40.75,
27
+ "total_runtime": 607.5,
28
+ "tags": [
29
+ "commit0"
30
+ ]
31
+ },
32
+ {
33
+ "benchmark": "multi-swe-bench",
34
+ "score": 28.4,
35
+ "metric": "resolve_rate",
36
+ "total_cost": 24.2,
37
+ "total_runtime": 442.0,
38
+ "tags": [
39
+ "multi-swe-bench"
40
+ ]
41
+ },
42
+ {
43
+ "benchmark": "swt-bench",
44
+ "score": 54.1,
45
+ "metric": "success_rate",
46
+ "total_cost": 37.05,
47
+ "total_runtime": 570.5,
48
+ "tags": [
49
+ "swt-bench"
50
+ ]
51
+ },
52
+ {
53
+ "benchmark": "gaia",
54
+ "score": 48.3,
55
+ "metric": "accuracy",
56
+ "total_cost": 34.15,
57
+ "total_runtime": 541.5,
58
+ "tags": [
59
+ "gaia"
60
+ ]
61
+ }
62
+ ]
mock_results/1.0.0-dev1/results/20251124_gpt_4o_2024_11_20/metadata.json ADDED
@@ -0,0 +1,9 @@
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "agent_name": "OpenHands CodeAct v2.0",
3
+ "agent_version": "OpenHands CodeAct v2.0",
4
+ "model": "gpt-4o-2024-11-20",
5
+ "openness": "closed_api_available",
6
+ "tool_usage": "standard",
7
+ "submission_time": "2025-11-24T19:56:00.092895",
8
+ "directory_name": "20251124_gpt_4o_2024_11_20"
9
+ }
mock_results/1.0.0-dev1/results/20251124_gpt_4o_2024_11_20/scores.json ADDED
@@ -0,0 +1,62 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ [
2
+ {
3
+ "benchmark": "swe-bench",
4
+ "score": 45.1,
5
+ "metric": "resolve_rate",
6
+ "total_cost": 32.55,
7
+ "total_runtime": 525.5,
8
+ "tags": [
9
+ "swe-bench"
10
+ ]
11
+ },
12
+ {
13
+ "benchmark": "swe-bench-multimodal",
14
+ "score": 39.5,
15
+ "metric": "resolve_rate",
16
+ "total_cost": 29.75,
17
+ "total_runtime": 497.5,
18
+ "tags": [
19
+ "swe-bench-multimodal"
20
+ ]
21
+ },
22
+ {
23
+ "benchmark": "commit0",
24
+ "score": 68.9,
25
+ "metric": "test_pass_rate",
26
+ "total_cost": 44.45,
27
+ "total_runtime": 644.5,
28
+ "tags": [
29
+ "commit0"
30
+ ]
31
+ },
32
+ {
33
+ "benchmark": "multi-swe-bench",
34
+ "score": 32.8,
35
+ "metric": "resolve_rate",
36
+ "total_cost": 26.4,
37
+ "total_runtime": 464.0,
38
+ "tags": [
39
+ "multi-swe-bench"
40
+ ]
41
+ },
42
+ {
43
+ "benchmark": "swt-bench",
44
+ "score": 62.3,
45
+ "metric": "success_rate",
46
+ "total_cost": 41.15,
47
+ "total_runtime": 611.5,
48
+ "tags": [
49
+ "swt-bench"
50
+ ]
51
+ },
52
+ {
53
+ "benchmark": "gaia",
54
+ "score": 55.2,
55
+ "metric": "accuracy",
56
+ "total_cost": 37.6,
57
+ "total_runtime": 576.0,
58
+ "tags": [
59
+ "gaia"
60
+ ]
61
+ }
62
+ ]
mock_results/1.0.0-dev1/results/20251124_gpt_4o_mini_2024_07_18/metadata.json ADDED
@@ -0,0 +1,9 @@
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "agent_name": "Agentless",
3
+ "agent_version": "Agentless",
4
+ "model": "gpt-4o-mini-2024-07-18",
5
+ "openness": "closed_api_available",
6
+ "tool_usage": "standard",
7
+ "submission_time": "2025-11-24T19:56:00.092916",
8
+ "directory_name": "20251124_gpt_4o_mini_2024_07_18"
9
+ }
mock_results/1.0.0-dev1/results/20251124_gpt_4o_mini_2024_07_18/scores.json ADDED
@@ -0,0 +1,62 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ [
2
+ {
3
+ "benchmark": "swe-bench",
4
+ "score": 32.5,
5
+ "metric": "resolve_rate",
6
+ "total_cost": 26.25,
7
+ "total_runtime": 462.5,
8
+ "tags": [
9
+ "swe-bench"
10
+ ]
11
+ },
12
+ {
13
+ "benchmark": "swe-bench-multimodal",
14
+ "score": 28.9,
15
+ "metric": "resolve_rate",
16
+ "total_cost": 24.45,
17
+ "total_runtime": 444.5,
18
+ "tags": [
19
+ "swe-bench-multimodal"
20
+ ]
21
+ },
22
+ {
23
+ "benchmark": "commit0",
24
+ "score": 55.3,
25
+ "metric": "test_pass_rate",
26
+ "total_cost": 37.65,
27
+ "total_runtime": 576.5,
28
+ "tags": [
29
+ "commit0"
30
+ ]
31
+ },
32
+ {
33
+ "benchmark": "multi-swe-bench",
34
+ "score": 24.1,
35
+ "metric": "resolve_rate",
36
+ "total_cost": 22.05,
37
+ "total_runtime": 420.5,
38
+ "tags": [
39
+ "multi-swe-bench"
40
+ ]
41
+ },
42
+ {
43
+ "benchmark": "swt-bench",
44
+ "score": 47.8,
45
+ "metric": "success_rate",
46
+ "total_cost": 33.9,
47
+ "total_runtime": 539.0,
48
+ "tags": [
49
+ "swt-bench"
50
+ ]
51
+ },
52
+ {
53
+ "benchmark": "gaia",
54
+ "score": 42.1,
55
+ "metric": "accuracy",
56
+ "total_cost": 31.05,
57
+ "total_runtime": 510.5,
58
+ "tags": [
59
+ "gaia"
60
+ ]
61
+ }
62
+ ]
mock_results/1.0.0-dev1/swe-bench-multimodal.jsonl DELETED
@@ -1,5 +0,0 @@
1
- {"agent_name": "1.0.2", "llm_base": "claude-3-5-sonnet-20241022", "openness": "closed", "score": 42.1, "metric": "resolve_rate", "submission_time": "2025-11-25T00:12:22.972550", "tags": ["swe-bench-multimodal"], "total_cost": 31.05, "total_runtime": 510.5}
2
- {"agent_name": "1.0.1", "llm_base": "gpt-4o-2024-11-20", "openness": "closed", "score": 39.5, "metric": "resolve_rate", "submission_time": "2025-11-25T00:12:22.972567", "tags": ["swe-bench-multimodal"], "total_cost": 29.75, "total_runtime": 497.5}
3
- {"agent_name": "1.0.0", "llm_base": "gpt-4-turbo-2024-04-09", "openness": "closed", "score": 34.2, "metric": "resolve_rate", "submission_time": "2025-11-25T00:12:22.972577", "tags": ["swe-bench-multimodal"], "total_cost": 27.1, "total_runtime": 471.0}
4
- {"agent_name": "0.9.5", "llm_base": "gpt-4o-mini-2024-07-18", "openness": "closed", "score": 28.9, "metric": "resolve_rate", "submission_time": "2025-11-25T00:12:22.972585", "tags": ["swe-bench-multimodal"], "total_cost": 24.45, "total_runtime": 444.5}
5
- {"agent_name": "0.9.0", "llm_base": "claude-3-opus-20240229", "openness": "closed", "score": 25.7, "metric": "resolve_rate", "submission_time": "2025-11-25T00:12:22.972593", "tags": ["swe-bench-multimodal"], "total_cost": 22.85, "total_runtime": 428.5}
 
 
 
 
 
 
mock_results/1.0.0-dev1/swe-bench.jsonl DELETED
@@ -1,5 +0,0 @@
1
- {"agent_name": "1.0.2", "llm_base": "claude-3-5-sonnet-20241022", "openness": "closed", "score": 48.3, "metric": "resolve_rate", "submission_time": "2025-11-25T00:12:22.972101", "tags": ["swe-bench"], "total_cost": 34.15, "total_runtime": 541.5}
2
- {"agent_name": "1.0.1", "llm_base": "gpt-4o-2024-11-20", "openness": "closed", "score": 45.1, "metric": "resolve_rate", "submission_time": "2025-11-25T00:12:22.972136", "tags": ["swe-bench"], "total_cost": 32.55, "total_runtime": 525.5}
3
- {"agent_name": "1.0.0", "llm_base": "gpt-4-turbo-2024-04-09", "openness": "closed", "score": 38.7, "metric": "resolve_rate", "submission_time": "2025-11-25T00:12:22.972167", "tags": ["swe-bench"], "total_cost": 29.35, "total_runtime": 493.5}
4
- {"agent_name": "0.9.5", "llm_base": "gpt-4o-mini-2024-07-18", "openness": "closed", "score": 32.5, "metric": "resolve_rate", "submission_time": "2025-11-25T00:12:22.972178", "tags": ["swe-bench"], "total_cost": 26.25, "total_runtime": 462.5}
5
- {"agent_name": "0.9.0", "llm_base": "claude-3-opus-20240229", "openness": "closed", "score": 29.8, "metric": "resolve_rate", "submission_time": "2025-11-25T00:12:22.972186", "tags": ["swe-bench"], "total_cost": 24.9, "total_runtime": 449.0}
 
 
 
 
 
 
mock_results/1.0.0-dev1/swt-bench.jsonl DELETED
@@ -1,5 +0,0 @@
1
- {"agent_name": "1.0.2", "llm_base": "claude-3-5-sonnet-20241022", "openness": "closed", "score": 65.4, "metric": "success_rate", "submission_time": "2025-11-25T00:12:22.972724", "tags": ["swt-bench"], "total_cost": 42.7, "total_runtime": 627.0}
2
- {"agent_name": "1.0.1", "llm_base": "gpt-4o-2024-11-20", "openness": "closed", "score": 62.3, "metric": "success_rate", "submission_time": "2025-11-25T00:12:22.972741", "tags": ["swt-bench"], "total_cost": 41.15, "total_runtime": 611.5}
3
- {"agent_name": "1.0.0", "llm_base": "gpt-4-turbo-2024-04-09", "openness": "closed", "score": 54.1, "metric": "success_rate", "submission_time": "2025-11-25T00:12:22.972750", "tags": ["swt-bench"], "total_cost": 37.05, "total_runtime": 570.5}
4
- {"agent_name": "0.9.5", "llm_base": "gpt-4o-mini-2024-07-18", "openness": "closed", "score": 47.8, "metric": "success_rate", "submission_time": "2025-11-25T00:12:22.972758", "tags": ["swt-bench"], "total_cost": 33.9, "total_runtime": 539.0}
5
- {"agent_name": "0.9.0", "llm_base": "claude-3-opus-20240229", "openness": "closed", "score": 44.2, "metric": "success_rate", "submission_time": "2025-11-25T00:12:22.972765", "tags": ["swt-bench"], "total_cost": 32.1, "total_runtime": 521.0}
 
 
 
 
 
 
mock_results/1.0.0-dev1/test.jsonl DELETED
@@ -1,30 +0,0 @@
1
- {"agent_name": "OpenHands CodeAct v2.1", "llm_base": "claude-3-5-sonnet-20241022", "openness": "closed_api_available", "tool_usage": "standard", "score": 48.3, "metric": "resolve_rate", "submission_time": "2025-11-24T19:56:00.092865", "tags": ["swe-bench"], "total_cost": 34.15, "total_runtime": 541.5}
2
- {"agent_name": "OpenHands CodeAct v2.0", "llm_base": "gpt-4o-2024-11-20", "openness": "closed_api_available", "tool_usage": "standard", "score": 45.1, "metric": "resolve_rate", "submission_time": "2025-11-24T19:56:00.092895", "tags": ["swe-bench"], "total_cost": 32.55, "total_runtime": 525.5}
3
- {"agent_name": "AutoCodeRover", "llm_base": "gpt-4-turbo-2024-04-09", "openness": "closed_api_available", "tool_usage": "standard", "score": 38.7, "metric": "resolve_rate", "submission_time": "2025-11-24T19:56:00.092908", "tags": ["swe-bench"], "total_cost": 29.35, "total_runtime": 493.5}
4
- {"agent_name": "Agentless", "llm_base": "gpt-4o-mini-2024-07-18", "openness": "closed_api_available", "tool_usage": "standard", "score": 32.5, "metric": "resolve_rate", "submission_time": "2025-11-24T19:56:00.092916", "tags": ["swe-bench"], "total_cost": 26.25, "total_runtime": 462.5}
5
- {"agent_name": "SWE-Agent", "llm_base": "claude-3-opus-20240229", "openness": "closed_api_available", "tool_usage": "custom_interface", "score": 29.8, "metric": "resolve_rate", "submission_time": "2025-11-24T19:56:00.092922", "tags": ["swe-bench"], "total_cost": 24.9, "total_runtime": 449.0}
6
- {"agent_name": "OpenHands CodeAct v2.1", "llm_base": "claude-3-5-sonnet-20241022", "openness": "closed_api_available", "tool_usage": "standard", "score": 35.2, "metric": "resolve_rate", "submission_time": "2025-11-24T19:56:00.093026", "tags": ["multi-swe-bench"], "total_cost": 27.6, "total_runtime": 476.0}
7
- {"agent_name": "OpenHands CodeAct v2.0", "llm_base": "gpt-4o-2024-11-20", "openness": "closed_api_available", "tool_usage": "standard", "score": 32.8, "metric": "resolve_rate", "submission_time": "2025-11-24T19:56:00.093040", "tags": ["multi-swe-bench"], "total_cost": 26.4, "total_runtime": 464.0}
8
- {"agent_name": "AutoCodeRover", "llm_base": "gpt-4-turbo-2024-04-09", "openness": "closed_api_available", "tool_usage": "standard", "score": 28.4, "metric": "resolve_rate", "submission_time": "2025-11-24T19:56:00.093048", "tags": ["multi-swe-bench"], "total_cost": 24.2, "total_runtime": 442.0}
9
- {"agent_name": "Agentless", "llm_base": "gpt-4o-mini-2024-07-18", "openness": "closed_api_available", "tool_usage": "standard", "score": 24.1, "metric": "resolve_rate", "submission_time": "2025-11-24T19:56:00.093058", "tags": ["multi-swe-bench"], "total_cost": 22.05, "total_runtime": 420.5}
10
- {"agent_name": "SWE-Agent", "llm_base": "claude-3-opus-20240229", "openness": "closed_api_available", "tool_usage": "custom_interface", "score": 21.5, "metric": "resolve_rate", "submission_time": "2025-11-24T19:56:00.093067", "tags": ["multi-swe-bench"], "total_cost": 20.75, "total_runtime": 407.5}
11
- {"agent_name": "OpenHands CodeAct v2.1", "llm_base": "claude-3-5-sonnet-20241022", "openness": "closed_api_available", "tool_usage": "standard", "score": 42.1, "metric": "resolve_rate", "submission_time": "2025-11-24T19:56:00.093172", "tags": ["swe-bench-multimodal"], "total_cost": 31.05, "total_runtime": 510.5}
12
- {"agent_name": "OpenHands CodeAct v2.0", "llm_base": "gpt-4o-2024-11-20", "openness": "closed_api_available", "tool_usage": "standard", "score": 39.5, "metric": "resolve_rate", "submission_time": "2025-11-24T19:56:00.093187", "tags": ["swe-bench-multimodal"], "total_cost": 29.75, "total_runtime": 497.5}
13
- {"agent_name": "AutoCodeRover", "llm_base": "gpt-4-turbo-2024-04-09", "openness": "closed_api_available", "tool_usage": "standard", "score": 34.2, "metric": "resolve_rate", "submission_time": "2025-11-24T19:56:00.093200", "tags": ["swe-bench-multimodal"], "total_cost": 27.1, "total_runtime": 471.0}
14
- {"agent_name": "Agentless", "llm_base": "gpt-4o-mini-2024-07-18", "openness": "closed_api_available", "tool_usage": "standard", "score": 28.9, "metric": "resolve_rate", "submission_time": "2025-11-24T19:56:00.093208", "tags": ["swe-bench-multimodal"], "total_cost": 24.45, "total_runtime": 444.5}
15
- {"agent_name": "SWE-Agent", "llm_base": "claude-3-opus-20240229", "openness": "closed_api_available", "tool_usage": "custom_interface", "score": 25.7, "metric": "resolve_rate", "submission_time": "2025-11-24T19:56:00.093218", "tags": ["swe-bench-multimodal"], "total_cost": 22.85, "total_runtime": 428.5}
16
- {"agent_name": "OpenHands CodeAct v2.1", "llm_base": "claude-3-5-sonnet-20241022", "openness": "closed_api_available", "tool_usage": "standard", "score": 65.4, "metric": "success_rate", "submission_time": "2025-11-24T19:56:00.093304", "tags": ["swt-bench"], "total_cost": 42.7, "total_runtime": 627.0}
17
- {"agent_name": "OpenHands CodeAct v2.0", "llm_base": "gpt-4o-2024-11-20", "openness": "closed_api_available", "tool_usage": "standard", "score": 62.3, "metric": "success_rate", "submission_time": "2025-11-24T19:56:00.093317", "tags": ["swt-bench"], "total_cost": 41.15, "total_runtime": 611.5}
18
- {"agent_name": "AutoCodeRover", "llm_base": "gpt-4-turbo-2024-04-09", "openness": "closed_api_available", "tool_usage": "standard", "score": 54.1, "metric": "success_rate", "submission_time": "2025-11-24T19:56:00.093326", "tags": ["swt-bench"], "total_cost": 37.05, "total_runtime": 570.5}
19
- {"agent_name": "Agentless", "llm_base": "gpt-4o-mini-2024-07-18", "openness": "closed_api_available", "tool_usage": "standard", "score": 47.8, "metric": "success_rate", "submission_time": "2025-11-24T19:56:00.093333", "tags": ["swt-bench"], "total_cost": 33.9, "total_runtime": 539.0}
20
- {"agent_name": "SWE-Agent", "llm_base": "claude-3-opus-20240229", "openness": "closed_api_available", "tool_usage": "custom_interface", "score": 44.2, "metric": "success_rate", "submission_time": "2025-11-24T19:56:00.093338", "tags": ["swt-bench"], "total_cost": 32.1, "total_runtime": 521.0}
21
- {"agent_name": "OpenHands CodeAct v2.1", "llm_base": "claude-3-5-sonnet-20241022", "openness": "closed_api_available", "tool_usage": "standard", "score": 71.2, "metric": "test_pass_rate", "submission_time": "2025-11-24T19:56:00.093416", "tags": ["commit0"], "total_cost": 45.6, "total_runtime": 656.0}
22
- {"agent_name": "OpenHands CodeAct v2.0", "llm_base": "gpt-4o-2024-11-20", "openness": "closed_api_available", "tool_usage": "standard", "score": 68.9, "metric": "test_pass_rate", "submission_time": "2025-11-24T19:56:00.093428", "tags": ["commit0"], "total_cost": 44.45, "total_runtime": 644.5}
23
- {"agent_name": "AutoCodeRover", "llm_base": "gpt-4-turbo-2024-04-09", "openness": "closed_api_available", "tool_usage": "standard", "score": 61.5, "metric": "test_pass_rate", "submission_time": "2025-11-24T19:56:00.093436", "tags": ["commit0"], "total_cost": 40.75, "total_runtime": 607.5}
24
- {"agent_name": "Agentless", "llm_base": "gpt-4o-mini-2024-07-18", "openness": "closed_api_available", "tool_usage": "standard", "score": 55.3, "metric": "test_pass_rate", "submission_time": "2025-11-24T19:56:00.093445", "tags": ["commit0"], "total_cost": 37.65, "total_runtime": 576.5}
25
- {"agent_name": "SWE-Agent", "llm_base": "claude-3-opus-20240229", "openness": "closed_api_available", "tool_usage": "custom_interface", "score": 52.1, "metric": "test_pass_rate", "submission_time": "2025-11-24T19:56:00.093450", "tags": ["commit0"], "total_cost": 36.05, "total_runtime": 560.5}
26
- {"agent_name": "OpenHands CodeAct v2.1", "llm_base": "claude-3-5-sonnet-20241022", "openness": "closed_api_available", "tool_usage": "standard", "score": 58.7, "metric": "accuracy", "submission_time": "2025-11-24T19:56:00.093541", "tags": ["gaia"], "total_cost": 39.35, "total_runtime": 593.5}
27
- {"agent_name": "OpenHands CodeAct v2.0", "llm_base": "gpt-4o-2024-11-20", "openness": "closed_api_available", "tool_usage": "standard", "score": 55.2, "metric": "accuracy", "submission_time": "2025-11-24T19:56:00.093556", "tags": ["gaia"], "total_cost": 37.6, "total_runtime": 576.0}
28
- {"agent_name": "AutoCodeRover", "llm_base": "gpt-4-turbo-2024-04-09", "openness": "closed_api_available", "tool_usage": "standard", "score": 48.3, "metric": "accuracy", "submission_time": "2025-11-24T19:56:00.093563", "tags": ["gaia"], "total_cost": 34.15, "total_runtime": 541.5}
29
- {"agent_name": "Agentless", "llm_base": "gpt-4o-mini-2024-07-18", "openness": "closed_api_available", "tool_usage": "standard", "score": 42.1, "metric": "accuracy", "submission_time": "2025-11-24T19:56:00.093568", "tags": ["gaia"], "total_cost": 31.05, "total_runtime": 510.5}
30
- {"agent_name": "SWE-Agent", "llm_base": "claude-3-opus-20240229", "openness": "closed_api_available", "tool_usage": "custom_interface", "score": 39.4, "metric": "accuracy", "submission_time": "2025-11-24T19:56:00.093574", "tags": ["gaia"], "total_cost": 29.7, "total_runtime": 497.0}
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
mock_results/1.0.0-dev1/test.parquet DELETED
Binary file (9.26 kB)
 
mock_results/1.0.0-dev1/validation.jsonl DELETED
@@ -1,30 +0,0 @@
1
- {"agent_name": "OpenHands CodeAct v2.1", "llm_base": "claude-3-5-sonnet-20241022", "openness": "closed_api_available", "tool_usage": "standard", "score": 48.3, "metric": "resolve_rate", "submission_time": "2025-11-24T19:56:00.092865", "tags": ["swe-bench"], "total_cost": 34.15, "total_runtime": 541.5}
2
- {"agent_name": "OpenHands CodeAct v2.0", "llm_base": "gpt-4o-2024-11-20", "openness": "closed_api_available", "tool_usage": "standard", "score": 45.1, "metric": "resolve_rate", "submission_time": "2025-11-24T19:56:00.092895", "tags": ["swe-bench"], "total_cost": 32.55, "total_runtime": 525.5}
3
- {"agent_name": "AutoCodeRover", "llm_base": "gpt-4-turbo-2024-04-09", "openness": "closed_api_available", "tool_usage": "standard", "score": 38.7, "metric": "resolve_rate", "submission_time": "2025-11-24T19:56:00.092908", "tags": ["swe-bench"], "total_cost": 29.35, "total_runtime": 493.5}
4
- {"agent_name": "Agentless", "llm_base": "gpt-4o-mini-2024-07-18", "openness": "closed_api_available", "tool_usage": "standard", "score": 32.5, "metric": "resolve_rate", "submission_time": "2025-11-24T19:56:00.092916", "tags": ["swe-bench"], "total_cost": 26.25, "total_runtime": 462.5}
5
- {"agent_name": "SWE-Agent", "llm_base": "claude-3-opus-20240229", "openness": "closed_api_available", "tool_usage": "custom_interface", "score": 29.8, "metric": "resolve_rate", "submission_time": "2025-11-24T19:56:00.092922", "tags": ["swe-bench"], "total_cost": 24.9, "total_runtime": 449.0}
6
- {"agent_name": "OpenHands CodeAct v2.1", "llm_base": "claude-3-5-sonnet-20241022", "openness": "closed_api_available", "tool_usage": "standard", "score": 35.2, "metric": "resolve_rate", "submission_time": "2025-11-24T19:56:00.093026", "tags": ["multi-swe-bench"], "total_cost": 27.6, "total_runtime": 476.0}
7
- {"agent_name": "OpenHands CodeAct v2.0", "llm_base": "gpt-4o-2024-11-20", "openness": "closed_api_available", "tool_usage": "standard", "score": 32.8, "metric": "resolve_rate", "submission_time": "2025-11-24T19:56:00.093040", "tags": ["multi-swe-bench"], "total_cost": 26.4, "total_runtime": 464.0}
8
- {"agent_name": "AutoCodeRover", "llm_base": "gpt-4-turbo-2024-04-09", "openness": "closed_api_available", "tool_usage": "standard", "score": 28.4, "metric": "resolve_rate", "submission_time": "2025-11-24T19:56:00.093048", "tags": ["multi-swe-bench"], "total_cost": 24.2, "total_runtime": 442.0}
9
- {"agent_name": "Agentless", "llm_base": "gpt-4o-mini-2024-07-18", "openness": "closed_api_available", "tool_usage": "standard", "score": 24.1, "metric": "resolve_rate", "submission_time": "2025-11-24T19:56:00.093058", "tags": ["multi-swe-bench"], "total_cost": 22.05, "total_runtime": 420.5}
10
- {"agent_name": "SWE-Agent", "llm_base": "claude-3-opus-20240229", "openness": "closed_api_available", "tool_usage": "custom_interface", "score": 21.5, "metric": "resolve_rate", "submission_time": "2025-11-24T19:56:00.093067", "tags": ["multi-swe-bench"], "total_cost": 20.75, "total_runtime": 407.5}
11
- {"agent_name": "OpenHands CodeAct v2.1", "llm_base": "claude-3-5-sonnet-20241022", "openness": "closed_api_available", "tool_usage": "standard", "score": 42.1, "metric": "resolve_rate", "submission_time": "2025-11-24T19:56:00.093172", "tags": ["swe-bench-multimodal"], "total_cost": 31.05, "total_runtime": 510.5}
12
- {"agent_name": "OpenHands CodeAct v2.0", "llm_base": "gpt-4o-2024-11-20", "openness": "closed_api_available", "tool_usage": "standard", "score": 39.5, "metric": "resolve_rate", "submission_time": "2025-11-24T19:56:00.093187", "tags": ["swe-bench-multimodal"], "total_cost": 29.75, "total_runtime": 497.5}
13
- {"agent_name": "AutoCodeRover", "llm_base": "gpt-4-turbo-2024-04-09", "openness": "closed_api_available", "tool_usage": "standard", "score": 34.2, "metric": "resolve_rate", "submission_time": "2025-11-24T19:56:00.093200", "tags": ["swe-bench-multimodal"], "total_cost": 27.1, "total_runtime": 471.0}
14
- {"agent_name": "Agentless", "llm_base": "gpt-4o-mini-2024-07-18", "openness": "closed_api_available", "tool_usage": "standard", "score": 28.9, "metric": "resolve_rate", "submission_time": "2025-11-24T19:56:00.093208", "tags": ["swe-bench-multimodal"], "total_cost": 24.45, "total_runtime": 444.5}
15
- {"agent_name": "SWE-Agent", "llm_base": "claude-3-opus-20240229", "openness": "closed_api_available", "tool_usage": "custom_interface", "score": 25.7, "metric": "resolve_rate", "submission_time": "2025-11-24T19:56:00.093218", "tags": ["swe-bench-multimodal"], "total_cost": 22.85, "total_runtime": 428.5}
16
- {"agent_name": "OpenHands CodeAct v2.1", "llm_base": "claude-3-5-sonnet-20241022", "openness": "closed_api_available", "tool_usage": "standard", "score": 65.4, "metric": "success_rate", "submission_time": "2025-11-24T19:56:00.093304", "tags": ["swt-bench"], "total_cost": 42.7, "total_runtime": 627.0}
17
- {"agent_name": "OpenHands CodeAct v2.0", "llm_base": "gpt-4o-2024-11-20", "openness": "closed_api_available", "tool_usage": "standard", "score": 62.3, "metric": "success_rate", "submission_time": "2025-11-24T19:56:00.093317", "tags": ["swt-bench"], "total_cost": 41.15, "total_runtime": 611.5}
18
- {"agent_name": "AutoCodeRover", "llm_base": "gpt-4-turbo-2024-04-09", "openness": "closed_api_available", "tool_usage": "standard", "score": 54.1, "metric": "success_rate", "submission_time": "2025-11-24T19:56:00.093326", "tags": ["swt-bench"], "total_cost": 37.05, "total_runtime": 570.5}
19
- {"agent_name": "Agentless", "llm_base": "gpt-4o-mini-2024-07-18", "openness": "closed_api_available", "tool_usage": "standard", "score": 47.8, "metric": "success_rate", "submission_time": "2025-11-24T19:56:00.093333", "tags": ["swt-bench"], "total_cost": 33.9, "total_runtime": 539.0}
20
- {"agent_name": "SWE-Agent", "llm_base": "claude-3-opus-20240229", "openness": "closed_api_available", "tool_usage": "custom_interface", "score": 44.2, "metric": "success_rate", "submission_time": "2025-11-24T19:56:00.093338", "tags": ["swt-bench"], "total_cost": 32.1, "total_runtime": 521.0}
21
- {"agent_name": "OpenHands CodeAct v2.1", "llm_base": "claude-3-5-sonnet-20241022", "openness": "closed_api_available", "tool_usage": "standard", "score": 71.2, "metric": "test_pass_rate", "submission_time": "2025-11-24T19:56:00.093416", "tags": ["commit0"], "total_cost": 45.6, "total_runtime": 656.0}
22
- {"agent_name": "OpenHands CodeAct v2.0", "llm_base": "gpt-4o-2024-11-20", "openness": "closed_api_available", "tool_usage": "standard", "score": 68.9, "metric": "test_pass_rate", "submission_time": "2025-11-24T19:56:00.093428", "tags": ["commit0"], "total_cost": 44.45, "total_runtime": 644.5}
23
- {"agent_name": "AutoCodeRover", "llm_base": "gpt-4-turbo-2024-04-09", "openness": "closed_api_available", "tool_usage": "standard", "score": 61.5, "metric": "test_pass_rate", "submission_time": "2025-11-24T19:56:00.093436", "tags": ["commit0"], "total_cost": 40.75, "total_runtime": 607.5}
24
- {"agent_name": "Agentless", "llm_base": "gpt-4o-mini-2024-07-18", "openness": "closed_api_available", "tool_usage": "standard", "score": 55.3, "metric": "test_pass_rate", "submission_time": "2025-11-24T19:56:00.093445", "tags": ["commit0"], "total_cost": 37.65, "total_runtime": 576.5}
25
- {"agent_name": "SWE-Agent", "llm_base": "claude-3-opus-20240229", "openness": "closed_api_available", "tool_usage": "custom_interface", "score": 52.1, "metric": "test_pass_rate", "submission_time": "2025-11-24T19:56:00.093450", "tags": ["commit0"], "total_cost": 36.05, "total_runtime": 560.5}
26
- {"agent_name": "OpenHands CodeAct v2.1", "llm_base": "claude-3-5-sonnet-20241022", "openness": "closed_api_available", "tool_usage": "standard", "score": 58.7, "metric": "accuracy", "submission_time": "2025-11-24T19:56:00.093541", "tags": ["gaia"], "total_cost": 39.35, "total_runtime": 593.5}
27
- {"agent_name": "OpenHands CodeAct v2.0", "llm_base": "gpt-4o-2024-11-20", "openness": "closed_api_available", "tool_usage": "standard", "score": 55.2, "metric": "accuracy", "submission_time": "2025-11-24T19:56:00.093556", "tags": ["gaia"], "total_cost": 37.6, "total_runtime": 576.0}
28
- {"agent_name": "AutoCodeRover", "llm_base": "gpt-4-turbo-2024-04-09", "openness": "closed_api_available", "tool_usage": "standard", "score": 48.3, "metric": "accuracy", "submission_time": "2025-11-24T19:56:00.093563", "tags": ["gaia"], "total_cost": 34.15, "total_runtime": 541.5}
29
- {"agent_name": "Agentless", "llm_base": "gpt-4o-mini-2024-07-18", "openness": "closed_api_available", "tool_usage": "standard", "score": 42.1, "metric": "accuracy", "submission_time": "2025-11-24T19:56:00.093568", "tags": ["gaia"], "total_cost": 31.05, "total_runtime": 510.5}
30
- {"agent_name": "SWE-Agent", "llm_base": "claude-3-opus-20240229", "openness": "closed_api_available", "tool_usage": "custom_interface", "score": 39.4, "metric": "accuracy", "submission_time": "2025-11-24T19:56:00.093574", "tags": ["gaia"], "total_cost": 29.7, "total_runtime": 497.0}
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
mock_results/1.0.0-dev1/validation.parquet DELETED
Binary file (9.29 kB)
 
requirements.txt CHANGED
@@ -4,7 +4,6 @@ pandas==2.2.3
4
  plotly==6.0.1
5
  requests==2.32.3
6
  huggingface-hub==0.30.2
7
- APScheduler==3.11.0
8
 
9
  # Additional dependencies for UI and processing
10
  matplotlib==3.10.3
 
4
  plotly==6.0.1
5
  requests==2.32.3
6
  huggingface-hub==0.30.2
 
7
 
8
  # Additional dependencies for UI and processing
9
  matplotlib==3.10.3
setup_data.py CHANGED
@@ -44,33 +44,29 @@ def fetch_data_from_github():
44
  return False
45
 
46
  # Look for data files in the cloned repository
47
- # Expected structure: openhands-index-results/{version}/test.jsonl, validation.jsonl, etc.
48
- data_source = temp_clone_dir / CONFIG_NAME
49
 
50
- if not data_source.exists():
51
- print(f"Data directory {data_source} not found in repository")
52
- # Try to find any version directories
53
- version_dirs = list(temp_clone_dir.glob("*.*.*"))
54
- if version_dirs:
55
- print(f"Found version directories: {[d.name for d in version_dirs]}")
56
- # Use the first available version
57
- data_source = version_dirs[0]
58
- print(f"Using data from {data_source.name}")
59
- else:
60
- print("No data found in repository")
61
- return False
62
 
63
- # Check if there are any JSONL files
64
- jsonl_files = list(data_source.glob("*.jsonl"))
65
- if not jsonl_files:
66
- print(f"No JSONL files found in {data_source}")
67
  return False
68
 
69
- # Create target directory and copy data
 
 
70
  os.makedirs(target_dir.parent, exist_ok=True)
71
  if target_dir.exists():
72
  shutil.rmtree(target_dir)
73
- shutil.copytree(data_source, target_dir)
 
 
 
74
 
75
  print(f"Successfully fetched data from GitHub. Files: {list(target_dir.glob('*'))}")
76
  return True
 
44
  return False
45
 
46
  # Look for data files in the cloned repository
47
+ # Expected structure: openhands-index-results/results/YYYYMMDD_model/
48
+ results_source = temp_clone_dir / "results"
49
 
50
+ if not results_source.exists():
51
+ print(f"Results directory not found in repository")
52
+ return False
 
 
 
 
 
 
 
 
 
53
 
54
+ # Check if there are any agent result directories
55
+ result_dirs = list(results_source.iterdir())
56
+ if not result_dirs:
57
+ print(f"No agent results found in {results_source}")
58
  return False
59
 
60
+ print(f"Found {len(result_dirs)} agent result directories")
61
+
62
+ # Create target directory and copy the results structure
63
  os.makedirs(target_dir.parent, exist_ok=True)
64
  if target_dir.exists():
65
  shutil.rmtree(target_dir)
66
+
67
+ # Copy the entire results directory
68
+ target_results = target_dir / "results"
69
+ shutil.copytree(results_source, target_results)
70
 
71
  print(f"Successfully fetched data from GitHub. Files: {list(target_dir.glob('*'))}")
72
  return True
simple_data_loader.py CHANGED
@@ -103,43 +103,16 @@ class SimpleLeaderboardViewer:
103
  return pd.DataFrame(all_records)
104
 
105
  def _load(self):
106
- """Load the JSONL file for the split and return DataFrame and tag map."""
107
- # Try new format first (agent-centric directories)
108
  df = self._load_from_agent_dirs()
109
 
110
  if df is None:
111
- # Fall back to old format (benchmark-centric JSONL)
112
- jsonl_file = self.config_path / f"{self.split}.jsonl"
113
-
114
- if not jsonl_file.exists():
115
- # Return empty dataframe with error message
116
- return pd.DataFrame({
117
- "Message": [f"No data found for split '{self.split}'. Expected file: {jsonl_file}"]
118
- }), {}
119
-
120
- try:
121
- # Read JSONL file
122
- records = []
123
- with open(jsonl_file, 'r') as f:
124
- for line in f:
125
- if line.strip():
126
- records.append(json.loads(line))
127
-
128
- if not records:
129
- return pd.DataFrame({
130
- "Message": [f"No data in file: {jsonl_file}"]
131
- }), {}
132
-
133
- # Convert to DataFrame
134
- df = pd.DataFrame(records)
135
- except Exception as e:
136
- import traceback
137
- traceback.print_exc()
138
- return pd.DataFrame({
139
- "Message": [f"Error loading data: {e}"]
140
- }), {}
141
 
142
- # Now process the dataframe (works for both old and new format)
143
  try:
144
 
145
  # Transform to expected format for leaderboard
 
103
  return pd.DataFrame(all_records)
104
 
105
  def _load(self):
106
+ """Load data from agent-centric directories and return DataFrame and tag map."""
 
107
  df = self._load_from_agent_dirs()
108
 
109
  if df is None:
110
+ # Return empty dataframe with error message
111
+ return pd.DataFrame({
112
+ "Message": [f"No data found for split '{self.split}' in results directory"]
113
+ }), {}
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
114
 
115
+ # Process the dataframe
116
  try:
117
 
118
  # Transform to expected format for leaderboard