openhands openhands committed on
Commit
0ee2099
·
1 Parent(s): 5e9c3b9

Update UI: All-Hands-AI color scheme, agent version column names, and OpenHands logo

Browse files

- Apply All-Hands-AI color scheme (yellow accent, indigo links, beige background)
- Change 'Agent' column to 'Agent Version'
- Remove 'Submitter' column
- Change 'Models Used' to 'Model'
- Update mock data with version numbers (1.0.2, 1.0.1, etc.)
- Replace logo with OpenHands branding

Co-authored-by: openhands <[email protected]>

assets/logo.svg CHANGED
content.py CHANGED
@@ -30,7 +30,7 @@ INTRO_PARAGRAPH = """
30
  </ul>
31
 
32
  <p>
33
- This view is designed for quick comparison of general-purpose coding agents. For more details on how we calculate scores and cost, please see the <a href="/about" style="color: #0FCB8C; text-decoration: underline;">About</a> Page.
34
  </p>
35
  """
36
  SCATTER_DISCLAIMER = """
@@ -237,14 +237,16 @@ def hf_uri_to_web_url(uri: str) -> str:
237
 
238
 
239
  css = """
240
- /* CSS Color Variables using Gradio theme */
241
  :root {
242
- --color-primary-green: var(--primary-900); /* #0FCB8C */
243
- --color-primary-pink: var(--secondary-900); /* #f0529c */
244
- --color-neutral-light: var(--neutral-200); /* #C9C9C3 */
245
- --color-background-light: var(--neutral-50); /* #FAF2E9 */
246
- --color-background-dark: var(--neutral-900); /* #032629 */
247
- --color-text-light: var(--neutral-50); /* #FAF2E9 */
 
 
248
  }
249
 
250
  /* This makes space for the huggingface header bar which must be shown on HF spaces. */
@@ -350,13 +352,13 @@ table.gr-table {
350
  overflow: visible !important;
351
  }
352
  #pareto-disclaimer {
353
- color: #f0529c !important;
354
  }
355
  thead.svelte-1e98i6s th {
356
  background: white !important;
357
  }
358
  .dark thead.svelte-1e98i6s th {
359
- background: #091a1a !important;
360
  }
361
  .cell-wrap.svelte-v1pjjd {
362
  font-family: 'Manrope';
@@ -376,10 +378,10 @@ nav.svelte-ti537g.svelte-ti537g {
376
  position: relative !important;
377
  }
378
  .dark #leaderboard-accordion .label-wrap {
379
- color: #0FCB8C !important;
380
  }
381
  .dark block.svelte-1svsvh2 {
382
- background: #032629 !important;
383
  }
384
  .padding.svelte-phx28p {
385
  padding: 0 !important;
@@ -391,7 +393,7 @@ nav.svelte-ti537g.svelte-ti537g {
391
  gap: 10px !important;
392
  }
393
  .dark .primary-link-button {
394
- color: var(--color-primary-green);
395
  }
396
  .primary-link-button {
397
  background: none;
@@ -400,7 +402,7 @@ nav.svelte-ti537g.svelte-ti537g {
400
  margin: 0;
401
  font-family: inherit;
402
  font-size: 16px;
403
- color: var(--color-primary-pink);
404
  text-decoration: none;
405
  cursor: pointer;
406
  white-space: nowrap;
@@ -446,8 +448,8 @@ nav.svelte-ti537g.svelte-ti537g {
446
  content: attr(data-tooltip);
447
  position: absolute;
448
  bottom: 125%;
449
- background-color: #105257;
450
- color: #fff;
451
  padding: 10px;
452
  border-radius: 4px;
453
  font-size: 12px;
@@ -569,17 +571,17 @@ span.wrap[tabindex="0"][role="button"][data-editable="false"] {
569
  background-color: #1C3A3C;
570
  }
571
  .benchmark-main-subtitle{
572
- color: var(--color-primary-green);
573
  overflow: hidden;
574
  padding-top: 120px;
575
  }
576
  .benchmark-title{
577
- color: var(--color-primary-pink);
578
  margin-top: 50px;
579
  font-size: 20px;
580
  }
581
  .dark .benchmark-title{
582
- color: var(--color-primary-green);
583
  }
584
  .benchmark-description {
585
  margin: 20px 0;
@@ -692,7 +694,7 @@ span.wrap[tabindex="0"][role="button"][data-editable="false"] {
692
 
693
  #feedback-button {
694
  display: inline-block;
695
- background-color: #345d60;
696
  color: white;
697
  border: none;
698
  border-radius: 4px;
@@ -704,15 +706,15 @@ span.wrap[tabindex="0"][role="button"][data-editable="false"] {
704
  }
705
 
706
  #feedback-button:hover {
707
- background-color: #5d888b;
708
  transform: translateY(-2px);
709
  box-shadow: 0 6px 12px rgba(0,0,0,0.3);
710
  }
711
  .dark #main-header h2 {
712
- color: #0fcb8c;
713
  }
714
  #main-header h2 {
715
- color: #f0529c;
716
  }
717
 
718
  /* --- New HTML-Based Tooltip Styles --- */
@@ -732,8 +734,8 @@ span.wrap[tabindex="0"][role="button"][data-editable="false"] {
732
  /* Card appearance */
733
  position: fixed;
734
  z-index: 1000;
735
- background-color: #083c40;
736
- color: #e5e7eb;
737
  border-radius: 12px;
738
  padding: 15px;
739
  width: max-content;
 
30
  </ul>
31
 
32
  <p>
33
+ This view is designed for quick comparison of general-purpose coding agents. For more details on how we calculate scores and cost, please see the <a href="/about" style="color: #6366F1; text-decoration: underline;">About</a> Page.
34
  </p>
35
  """
36
  SCATTER_DISCLAIMER = """
 
237
 
238
 
239
  css = """
240
+ /* CSS Color Variables using All-Hands-AI color scheme */
241
  :root {
242
+ --color-primary-accent: #FFE165; /* Yellow accent from All-Hands-AI */
243
+ --color-primary-link: #6366F1; /* Indigo/purple links */
244
+ --color-neutral-light: #D4CABD; /* Tan placeholder */
245
+ --color-background-light: #F1EAE0; /* Beige/cream background */
246
+ --color-background-dark: #292A36; /* Dark gray */
247
+ --color-text-dark: #292A36; /* Dark gray text */
248
+ --color-text-light: #F1EAE0; /* Light text for dark backgrounds */
249
+ --color-button-hover: #4B5563; /* Darker gray for hover states */
250
  }
251
 
252
  /* This makes space for the huggingface header bar which must be shown on HF spaces. */
 
352
  overflow: visible !important;
353
  }
354
  #pareto-disclaimer {
355
+ color: var(--color-primary-accent) !important;
356
  }
357
  thead.svelte-1e98i6s th {
358
  background: white !important;
359
  }
360
  .dark thead.svelte-1e98i6s th {
361
+ background: var(--color-background-dark) !important;
362
  }
363
  .cell-wrap.svelte-v1pjjd {
364
  font-family: 'Manrope';
 
378
  position: relative !important;
379
  }
380
  .dark #leaderboard-accordion .label-wrap {
381
+ color: var(--color-primary-accent) !important;
382
  }
383
  .dark block.svelte-1svsvh2 {
384
+ background: var(--color-background-dark) !important;
385
  }
386
  .padding.svelte-phx28p {
387
  padding: 0 !important;
 
393
  gap: 10px !important;
394
  }
395
  .dark .primary-link-button {
396
+ color: var(--color-primary-link);
397
  }
398
  .primary-link-button {
399
  background: none;
 
402
  margin: 0;
403
  font-family: inherit;
404
  font-size: 16px;
405
+ color: var(--color-primary-link);
406
  text-decoration: none;
407
  cursor: pointer;
408
  white-space: nowrap;
 
448
  content: attr(data-tooltip);
449
  position: absolute;
450
  bottom: 125%;
451
+ background-color: var(--color-background-dark);
452
+ color: var(--color-text-light);
453
  padding: 10px;
454
  border-radius: 4px;
455
  font-size: 12px;
 
571
  background-color: #1C3A3C;
572
  }
573
  .benchmark-main-subtitle{
574
+ color: var(--color-primary-link);
575
  overflow: hidden;
576
  padding-top: 120px;
577
  }
578
  .benchmark-title{
579
+ color: var(--color-primary-link);
580
  margin-top: 50px;
581
  font-size: 20px;
582
  }
583
  .dark .benchmark-title{
584
+ color: var(--color-primary-accent);
585
  }
586
  .benchmark-description {
587
  margin: 20px 0;
 
694
 
695
  #feedback-button {
696
  display: inline-block;
697
+ background-color: var(--color-primary-link);
698
  color: white;
699
  border: none;
700
  border-radius: 4px;
 
706
  }
707
 
708
  #feedback-button:hover {
709
+ background-color: var(--color-button-hover);
710
  transform: translateY(-2px);
711
  box-shadow: 0 6px 12px rgba(0,0,0,0.3);
712
  }
713
  .dark #main-header h2 {
714
+ color: var(--color-primary-accent);
715
  }
716
  #main-header h2 {
717
+ color: var(--color-primary-link);
718
  }
719
 
720
  /* --- New HTML-Based Tooltip Styles --- */
 
734
  /* Card appearance */
735
  position: fixed;
736
  z-index: 1000;
737
+ background-color: var(--color-background-dark);
738
+ color: var(--color-text-light);
739
  border-radius: 12px;
740
  padding: 15px;
741
  width: max-content;
data/extracted/agenteval.json ADDED
@@ -0,0 +1,74 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "suite_config": {
3
+ "name": "openhands-index",
4
+ "version": "1.0.0-dev1",
5
+ "splits": [
6
+ {
7
+ "name": "swe-bench",
8
+ "tasks": [
9
+ {
10
+ "name": "swe-bench",
11
+ "tags": [
12
+ "swe-bench"
13
+ ]
14
+ }
15
+ ]
16
+ },
17
+ {
18
+ "name": "multi-swe-bench",
19
+ "tasks": [
20
+ {
21
+ "name": "multi-swe-bench",
22
+ "tags": [
23
+ "multi-swe-bench"
24
+ ]
25
+ }
26
+ ]
27
+ },
28
+ {
29
+ "name": "swe-bench-multimodal",
30
+ "tasks": [
31
+ {
32
+ "name": "swe-bench-multimodal",
33
+ "tags": [
34
+ "swe-bench-multimodal"
35
+ ]
36
+ }
37
+ ]
38
+ },
39
+ {
40
+ "name": "swt-bench",
41
+ "tasks": [
42
+ {
43
+ "name": "swt-bench",
44
+ "tags": [
45
+ "swt-bench"
46
+ ]
47
+ }
48
+ ]
49
+ },
50
+ {
51
+ "name": "commit0",
52
+ "tasks": [
53
+ {
54
+ "name": "commit0",
55
+ "tags": [
56
+ "commit0"
57
+ ]
58
+ }
59
+ ]
60
+ },
61
+ {
62
+ "name": "gaia",
63
+ "tasks": [
64
+ {
65
+ "name": "gaia",
66
+ "tags": [
67
+ "gaia"
68
+ ]
69
+ }
70
+ ]
71
+ }
72
+ ]
73
+ }
74
+ }
data/extracted/commit0.jsonl ADDED
@@ -0,0 +1,5 @@
 
 
 
 
 
 
1
+ {"agent_name": "1.0.2", "llm_base": "claude-3-5-sonnet-20241022", "openness": "closed_api_available", "tool_usage": "standard", "score": 71.2, "metric": "test_pass_rate", "submission_time": "2025-11-24T23:38:02.231539", "tags": ["commit0"], "total_cost": 45.6, "total_runtime": 656.0}
2
+ {"agent_name": "1.0.1", "llm_base": "gpt-4o-2024-11-20", "openness": "closed_api_available", "tool_usage": "standard", "score": 68.9, "metric": "test_pass_rate", "submission_time": "2025-11-24T23:38:02.231556", "tags": ["commit0"], "total_cost": 44.45, "total_runtime": 644.5}
3
+ {"agent_name": "1.0.0", "llm_base": "gpt-4-turbo-2024-04-09", "openness": "closed_api_available", "tool_usage": "standard", "score": 61.5, "metric": "test_pass_rate", "submission_time": "2025-11-24T23:38:02.231563", "tags": ["commit0"], "total_cost": 40.75, "total_runtime": 607.5}
4
+ {"agent_name": "0.9.5", "llm_base": "gpt-4o-mini-2024-07-18", "openness": "closed_api_available", "tool_usage": "standard", "score": 55.3, "metric": "test_pass_rate", "submission_time": "2025-11-24T23:38:02.231569", "tags": ["commit0"], "total_cost": 37.65, "total_runtime": 576.5}
5
+ {"agent_name": "0.9.0", "llm_base": "claude-3-opus-20240229", "openness": "closed_api_available", "tool_usage": "custom_interface", "score": 52.1, "metric": "test_pass_rate", "submission_time": "2025-11-24T23:38:02.231574", "tags": ["commit0"], "total_cost": 36.05, "total_runtime": 560.5}
data/extracted/gaia.jsonl ADDED
@@ -0,0 +1,5 @@
 
 
 
 
 
 
1
+ {"agent_name": "1.0.2", "llm_base": "claude-3-5-sonnet-20241022", "openness": "closed_api_available", "tool_usage": "standard", "score": 58.7, "metric": "accuracy", "submission_time": "2025-11-24T23:38:02.231711", "tags": ["gaia"], "total_cost": 39.35, "total_runtime": 593.5}
2
+ {"agent_name": "1.0.1", "llm_base": "gpt-4o-2024-11-20", "openness": "closed_api_available", "tool_usage": "standard", "score": 55.2, "metric": "accuracy", "submission_time": "2025-11-24T23:38:02.231728", "tags": ["gaia"], "total_cost": 37.6, "total_runtime": 576.0}
3
+ {"agent_name": "1.0.0", "llm_base": "gpt-4-turbo-2024-04-09", "openness": "closed_api_available", "tool_usage": "standard", "score": 48.3, "metric": "accuracy", "submission_time": "2025-11-24T23:38:02.231735", "tags": ["gaia"], "total_cost": 34.15, "total_runtime": 541.5}
4
+ {"agent_name": "0.9.5", "llm_base": "gpt-4o-mini-2024-07-18", "openness": "closed_api_available", "tool_usage": "standard", "score": 42.1, "metric": "accuracy", "submission_time": "2025-11-24T23:38:02.231741", "tags": ["gaia"], "total_cost": 31.05, "total_runtime": 510.5}
5
+ {"agent_name": "0.9.0", "llm_base": "claude-3-opus-20240229", "openness": "closed_api_available", "tool_usage": "custom_interface", "score": 39.4, "metric": "accuracy", "submission_time": "2025-11-24T23:38:02.231749", "tags": ["gaia"], "total_cost": 29.7, "total_runtime": 497.0}
data/extracted/multi-swe-bench.jsonl ADDED
@@ -0,0 +1,5 @@
 
 
 
 
 
 
1
+ {"agent_name": "1.0.2", "llm_base": "claude-3-5-sonnet-20241022", "openness": "closed_api_available", "tool_usage": "standard", "score": 35.2, "metric": "resolve_rate", "submission_time": "2025-11-24T23:38:02.230869", "tags": ["multi-swe-bench"], "total_cost": 27.6, "total_runtime": 476.0}
2
+ {"agent_name": "1.0.1", "llm_base": "gpt-4o-2024-11-20", "openness": "closed_api_available", "tool_usage": "standard", "score": 32.8, "metric": "resolve_rate", "submission_time": "2025-11-24T23:38:02.230889", "tags": ["multi-swe-bench"], "total_cost": 26.4, "total_runtime": 464.0}
3
+ {"agent_name": "1.0.0", "llm_base": "gpt-4-turbo-2024-04-09", "openness": "closed_api_available", "tool_usage": "standard", "score": 28.4, "metric": "resolve_rate", "submission_time": "2025-11-24T23:38:02.230899", "tags": ["multi-swe-bench"], "total_cost": 24.2, "total_runtime": 442.0}
4
+ {"agent_name": "0.9.5", "llm_base": "gpt-4o-mini-2024-07-18", "openness": "closed_api_available", "tool_usage": "standard", "score": 24.1, "metric": "resolve_rate", "submission_time": "2025-11-24T23:38:02.230908", "tags": ["multi-swe-bench"], "total_cost": 22.05, "total_runtime": 420.5}
5
+ {"agent_name": "0.9.0", "llm_base": "claude-3-opus-20240229", "openness": "closed_api_available", "tool_usage": "custom_interface", "score": 21.5, "metric": "resolve_rate", "submission_time": "2025-11-24T23:38:02.230917", "tags": ["multi-swe-bench"], "total_cost": 20.75, "total_runtime": 407.5}
data/extracted/swe-bench-multimodal.jsonl ADDED
@@ -0,0 +1,5 @@
 
 
 
 
 
 
1
+ {"agent_name": "1.0.2", "llm_base": "claude-3-5-sonnet-20241022", "openness": "closed_api_available", "tool_usage": "standard", "score": 42.1, "metric": "resolve_rate", "submission_time": "2025-11-24T23:38:02.231057", "tags": ["swe-bench-multimodal"], "total_cost": 31.05, "total_runtime": 510.5}
2
+ {"agent_name": "1.0.1", "llm_base": "gpt-4o-2024-11-20", "openness": "closed_api_available", "tool_usage": "standard", "score": 39.5, "metric": "resolve_rate", "submission_time": "2025-11-24T23:38:02.231073", "tags": ["swe-bench-multimodal"], "total_cost": 29.75, "total_runtime": 497.5}
3
+ {"agent_name": "1.0.0", "llm_base": "gpt-4-turbo-2024-04-09", "openness": "closed_api_available", "tool_usage": "standard", "score": 34.2, "metric": "resolve_rate", "submission_time": "2025-11-24T23:38:02.231082", "tags": ["swe-bench-multimodal"], "total_cost": 27.1, "total_runtime": 471.0}
4
+ {"agent_name": "0.9.5", "llm_base": "gpt-4o-mini-2024-07-18", "openness": "closed_api_available", "tool_usage": "standard", "score": 28.9, "metric": "resolve_rate", "submission_time": "2025-11-24T23:38:02.231088", "tags": ["swe-bench-multimodal"], "total_cost": 24.45, "total_runtime": 444.5}
5
+ {"agent_name": "0.9.0", "llm_base": "claude-3-opus-20240229", "openness": "closed_api_available", "tool_usage": "custom_interface", "score": 25.7, "metric": "resolve_rate", "submission_time": "2025-11-24T23:38:02.231093", "tags": ["swe-bench-multimodal"], "total_cost": 22.85, "total_runtime": 428.5}
data/extracted/swe-bench.jsonl ADDED
@@ -0,0 +1,5 @@
 
 
 
 
 
 
1
+ {"agent_name": "1.0.2", "llm_base": "claude-3-5-sonnet-20241022", "openness": "closed_api_available", "tool_usage": "standard", "score": 48.3, "metric": "resolve_rate", "submission_time": "2025-11-24T23:38:02.230638", "tags": ["swe-bench"], "total_cost": 34.15, "total_runtime": 541.5}
2
+ {"agent_name": "1.0.1", "llm_base": "gpt-4o-2024-11-20", "openness": "closed_api_available", "tool_usage": "standard", "score": 45.1, "metric": "resolve_rate", "submission_time": "2025-11-24T23:38:02.230668", "tags": ["swe-bench"], "total_cost": 32.55, "total_runtime": 525.5}
3
+ {"agent_name": "1.0.0", "llm_base": "gpt-4-turbo-2024-04-09", "openness": "closed_api_available", "tool_usage": "standard", "score": 38.7, "metric": "resolve_rate", "submission_time": "2025-11-24T23:38:02.230681", "tags": ["swe-bench"], "total_cost": 29.35, "total_runtime": 493.5}
4
+ {"agent_name": "0.9.5", "llm_base": "gpt-4o-mini-2024-07-18", "openness": "closed_api_available", "tool_usage": "standard", "score": 32.5, "metric": "resolve_rate", "submission_time": "2025-11-24T23:38:02.230689", "tags": ["swe-bench"], "total_cost": 26.25, "total_runtime": 462.5}
5
+ {"agent_name": "0.9.0", "llm_base": "claude-3-opus-20240229", "openness": "closed_api_available", "tool_usage": "custom_interface", "score": 29.8, "metric": "resolve_rate", "submission_time": "2025-11-24T23:38:02.230696", "tags": ["swe-bench"], "total_cost": 24.9, "total_runtime": 449.0}
data/extracted/swt-bench.jsonl ADDED
@@ -0,0 +1,5 @@
 
 
 
 
 
 
1
+ {"agent_name": "1.0.2", "llm_base": "claude-3-5-sonnet-20241022", "openness": "closed_api_available", "tool_usage": "standard", "score": 65.4, "metric": "success_rate", "submission_time": "2025-11-24T23:38:02.231319", "tags": ["swt-bench"], "total_cost": 42.7, "total_runtime": 627.0}
2
+ {"agent_name": "1.0.1", "llm_base": "gpt-4o-2024-11-20", "openness": "closed_api_available", "tool_usage": "standard", "score": 62.3, "metric": "success_rate", "submission_time": "2025-11-24T23:38:02.231344", "tags": ["swt-bench"], "total_cost": 41.15, "total_runtime": 611.5}
3
+ {"agent_name": "1.0.0", "llm_base": "gpt-4-turbo-2024-04-09", "openness": "closed_api_available", "tool_usage": "standard", "score": 54.1, "metric": "success_rate", "submission_time": "2025-11-24T23:38:02.231356", "tags": ["swt-bench"], "total_cost": 37.05, "total_runtime": 570.5}
4
+ {"agent_name": "0.9.5", "llm_base": "gpt-4o-mini-2024-07-18", "openness": "closed_api_available", "tool_usage": "standard", "score": 47.8, "metric": "success_rate", "submission_time": "2025-11-24T23:38:02.231365", "tags": ["swt-bench"], "total_cost": 33.9, "total_runtime": 539.0}
5
+ {"agent_name": "0.9.0", "llm_base": "claude-3-opus-20240229", "openness": "closed_api_available", "tool_usage": "custom_interface", "score": 44.2, "metric": "success_rate", "submission_time": "2025-11-24T23:38:02.231373", "tags": ["swt-bench"], "total_cost": 32.1, "total_runtime": 521.0}
data/extracted/test.jsonl ADDED
@@ -0,0 +1,30 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {"agent_name": "OpenHands CodeAct v2.1", "llm_base": "claude-3-5-sonnet-20241022", "openness": "closed_api_available", "tool_usage": "standard", "score": 48.3, "metric": "resolve_rate", "submission_time": "2025-11-24T19:56:00.092865", "tags": ["swe-bench"], "total_cost": 34.15, "total_runtime": 541.5}
2
+ {"agent_name": "OpenHands CodeAct v2.0", "llm_base": "gpt-4o-2024-11-20", "openness": "closed_api_available", "tool_usage": "standard", "score": 45.1, "metric": "resolve_rate", "submission_time": "2025-11-24T19:56:00.092895", "tags": ["swe-bench"], "total_cost": 32.55, "total_runtime": 525.5}
3
+ {"agent_name": "AutoCodeRover", "llm_base": "gpt-4-turbo-2024-04-09", "openness": "closed_api_available", "tool_usage": "standard", "score": 38.7, "metric": "resolve_rate", "submission_time": "2025-11-24T19:56:00.092908", "tags": ["swe-bench"], "total_cost": 29.35, "total_runtime": 493.5}
4
+ {"agent_name": "Agentless", "llm_base": "gpt-4o-mini-2024-07-18", "openness": "closed_api_available", "tool_usage": "standard", "score": 32.5, "metric": "resolve_rate", "submission_time": "2025-11-24T19:56:00.092916", "tags": ["swe-bench"], "total_cost": 26.25, "total_runtime": 462.5}
5
+ {"agent_name": "SWE-Agent", "llm_base": "claude-3-opus-20240229", "openness": "closed_api_available", "tool_usage": "custom_interface", "score": 29.8, "metric": "resolve_rate", "submission_time": "2025-11-24T19:56:00.092922", "tags": ["swe-bench"], "total_cost": 24.9, "total_runtime": 449.0}
6
+ {"agent_name": "OpenHands CodeAct v2.1", "llm_base": "claude-3-5-sonnet-20241022", "openness": "closed_api_available", "tool_usage": "standard", "score": 35.2, "metric": "resolve_rate", "submission_time": "2025-11-24T19:56:00.093026", "tags": ["multi-swe-bench"], "total_cost": 27.6, "total_runtime": 476.0}
7
+ {"agent_name": "OpenHands CodeAct v2.0", "llm_base": "gpt-4o-2024-11-20", "openness": "closed_api_available", "tool_usage": "standard", "score": 32.8, "metric": "resolve_rate", "submission_time": "2025-11-24T19:56:00.093040", "tags": ["multi-swe-bench"], "total_cost": 26.4, "total_runtime": 464.0}
8
+ {"agent_name": "AutoCodeRover", "llm_base": "gpt-4-turbo-2024-04-09", "openness": "closed_api_available", "tool_usage": "standard", "score": 28.4, "metric": "resolve_rate", "submission_time": "2025-11-24T19:56:00.093048", "tags": ["multi-swe-bench"], "total_cost": 24.2, "total_runtime": 442.0}
9
+ {"agent_name": "Agentless", "llm_base": "gpt-4o-mini-2024-07-18", "openness": "closed_api_available", "tool_usage": "standard", "score": 24.1, "metric": "resolve_rate", "submission_time": "2025-11-24T19:56:00.093058", "tags": ["multi-swe-bench"], "total_cost": 22.05, "total_runtime": 420.5}
10
+ {"agent_name": "SWE-Agent", "llm_base": "claude-3-opus-20240229", "openness": "closed_api_available", "tool_usage": "custom_interface", "score": 21.5, "metric": "resolve_rate", "submission_time": "2025-11-24T19:56:00.093067", "tags": ["multi-swe-bench"], "total_cost": 20.75, "total_runtime": 407.5}
11
+ {"agent_name": "OpenHands CodeAct v2.1", "llm_base": "claude-3-5-sonnet-20241022", "openness": "closed_api_available", "tool_usage": "standard", "score": 42.1, "metric": "resolve_rate", "submission_time": "2025-11-24T19:56:00.093172", "tags": ["swe-bench-multimodal"], "total_cost": 31.05, "total_runtime": 510.5}
12
+ {"agent_name": "OpenHands CodeAct v2.0", "llm_base": "gpt-4o-2024-11-20", "openness": "closed_api_available", "tool_usage": "standard", "score": 39.5, "metric": "resolve_rate", "submission_time": "2025-11-24T19:56:00.093187", "tags": ["swe-bench-multimodal"], "total_cost": 29.75, "total_runtime": 497.5}
13
+ {"agent_name": "AutoCodeRover", "llm_base": "gpt-4-turbo-2024-04-09", "openness": "closed_api_available", "tool_usage": "standard", "score": 34.2, "metric": "resolve_rate", "submission_time": "2025-11-24T19:56:00.093200", "tags": ["swe-bench-multimodal"], "total_cost": 27.1, "total_runtime": 471.0}
14
+ {"agent_name": "Agentless", "llm_base": "gpt-4o-mini-2024-07-18", "openness": "closed_api_available", "tool_usage": "standard", "score": 28.9, "metric": "resolve_rate", "submission_time": "2025-11-24T19:56:00.093208", "tags": ["swe-bench-multimodal"], "total_cost": 24.45, "total_runtime": 444.5}
15
+ {"agent_name": "SWE-Agent", "llm_base": "claude-3-opus-20240229", "openness": "closed_api_available", "tool_usage": "custom_interface", "score": 25.7, "metric": "resolve_rate", "submission_time": "2025-11-24T19:56:00.093218", "tags": ["swe-bench-multimodal"], "total_cost": 22.85, "total_runtime": 428.5}
16
+ {"agent_name": "OpenHands CodeAct v2.1", "llm_base": "claude-3-5-sonnet-20241022", "openness": "closed_api_available", "tool_usage": "standard", "score": 65.4, "metric": "success_rate", "submission_time": "2025-11-24T19:56:00.093304", "tags": ["swt-bench"], "total_cost": 42.7, "total_runtime": 627.0}
17
+ {"agent_name": "OpenHands CodeAct v2.0", "llm_base": "gpt-4o-2024-11-20", "openness": "closed_api_available", "tool_usage": "standard", "score": 62.3, "metric": "success_rate", "submission_time": "2025-11-24T19:56:00.093317", "tags": ["swt-bench"], "total_cost": 41.15, "total_runtime": 611.5}
18
+ {"agent_name": "AutoCodeRover", "llm_base": "gpt-4-turbo-2024-04-09", "openness": "closed_api_available", "tool_usage": "standard", "score": 54.1, "metric": "success_rate", "submission_time": "2025-11-24T19:56:00.093326", "tags": ["swt-bench"], "total_cost": 37.05, "total_runtime": 570.5}
19
+ {"agent_name": "Agentless", "llm_base": "gpt-4o-mini-2024-07-18", "openness": "closed_api_available", "tool_usage": "standard", "score": 47.8, "metric": "success_rate", "submission_time": "2025-11-24T19:56:00.093333", "tags": ["swt-bench"], "total_cost": 33.9, "total_runtime": 539.0}
20
+ {"agent_name": "SWE-Agent", "llm_base": "claude-3-opus-20240229", "openness": "closed_api_available", "tool_usage": "custom_interface", "score": 44.2, "metric": "success_rate", "submission_time": "2025-11-24T19:56:00.093338", "tags": ["swt-bench"], "total_cost": 32.1, "total_runtime": 521.0}
21
+ {"agent_name": "OpenHands CodeAct v2.1", "llm_base": "claude-3-5-sonnet-20241022", "openness": "closed_api_available", "tool_usage": "standard", "score": 71.2, "metric": "test_pass_rate", "submission_time": "2025-11-24T19:56:00.093416", "tags": ["commit0"], "total_cost": 45.6, "total_runtime": 656.0}
22
+ {"agent_name": "OpenHands CodeAct v2.0", "llm_base": "gpt-4o-2024-11-20", "openness": "closed_api_available", "tool_usage": "standard", "score": 68.9, "metric": "test_pass_rate", "submission_time": "2025-11-24T19:56:00.093428", "tags": ["commit0"], "total_cost": 44.45, "total_runtime": 644.5}
23
+ {"agent_name": "AutoCodeRover", "llm_base": "gpt-4-turbo-2024-04-09", "openness": "closed_api_available", "tool_usage": "standard", "score": 61.5, "metric": "test_pass_rate", "submission_time": "2025-11-24T19:56:00.093436", "tags": ["commit0"], "total_cost": 40.75, "total_runtime": 607.5}
24
+ {"agent_name": "Agentless", "llm_base": "gpt-4o-mini-2024-07-18", "openness": "closed_api_available", "tool_usage": "standard", "score": 55.3, "metric": "test_pass_rate", "submission_time": "2025-11-24T19:56:00.093445", "tags": ["commit0"], "total_cost": 37.65, "total_runtime": 576.5}
25
+ {"agent_name": "SWE-Agent", "llm_base": "claude-3-opus-20240229", "openness": "closed_api_available", "tool_usage": "custom_interface", "score": 52.1, "metric": "test_pass_rate", "submission_time": "2025-11-24T19:56:00.093450", "tags": ["commit0"], "total_cost": 36.05, "total_runtime": 560.5}
26
+ {"agent_name": "OpenHands CodeAct v2.1", "llm_base": "claude-3-5-sonnet-20241022", "openness": "closed_api_available", "tool_usage": "standard", "score": 58.7, "metric": "accuracy", "submission_time": "2025-11-24T19:56:00.093541", "tags": ["gaia"], "total_cost": 39.35, "total_runtime": 593.5}
27
+ {"agent_name": "OpenHands CodeAct v2.0", "llm_base": "gpt-4o-2024-11-20", "openness": "closed_api_available", "tool_usage": "standard", "score": 55.2, "metric": "accuracy", "submission_time": "2025-11-24T19:56:00.093556", "tags": ["gaia"], "total_cost": 37.6, "total_runtime": 576.0}
28
+ {"agent_name": "AutoCodeRover", "llm_base": "gpt-4-turbo-2024-04-09", "openness": "closed_api_available", "tool_usage": "standard", "score": 48.3, "metric": "accuracy", "submission_time": "2025-11-24T19:56:00.093563", "tags": ["gaia"], "total_cost": 34.15, "total_runtime": 541.5}
29
+ {"agent_name": "Agentless", "llm_base": "gpt-4o-mini-2024-07-18", "openness": "closed_api_available", "tool_usage": "standard", "score": 42.1, "metric": "accuracy", "submission_time": "2025-11-24T19:56:00.093568", "tags": ["gaia"], "total_cost": 31.05, "total_runtime": 510.5}
30
+ {"agent_name": "SWE-Agent", "llm_base": "claude-3-opus-20240229", "openness": "closed_api_available", "tool_usage": "custom_interface", "score": 39.4, "metric": "accuracy", "submission_time": "2025-11-24T19:56:00.093574", "tags": ["gaia"], "total_cost": 29.7, "total_runtime": 497.0}
data/extracted/test.parquet ADDED
Binary file (9.26 kB). View file
 
data/extracted/validation.jsonl ADDED
@@ -0,0 +1,30 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {"agent_name": "OpenHands CodeAct v2.1", "llm_base": "claude-3-5-sonnet-20241022", "openness": "closed_api_available", "tool_usage": "standard", "score": 48.3, "metric": "resolve_rate", "submission_time": "2025-11-24T19:56:00.092865", "tags": ["swe-bench"], "total_cost": 34.15, "total_runtime": 541.5}
2
+ {"agent_name": "OpenHands CodeAct v2.0", "llm_base": "gpt-4o-2024-11-20", "openness": "closed_api_available", "tool_usage": "standard", "score": 45.1, "metric": "resolve_rate", "submission_time": "2025-11-24T19:56:00.092895", "tags": ["swe-bench"], "total_cost": 32.55, "total_runtime": 525.5}
3
+ {"agent_name": "AutoCodeRover", "llm_base": "gpt-4-turbo-2024-04-09", "openness": "closed_api_available", "tool_usage": "standard", "score": 38.7, "metric": "resolve_rate", "submission_time": "2025-11-24T19:56:00.092908", "tags": ["swe-bench"], "total_cost": 29.35, "total_runtime": 493.5}
4
+ {"agent_name": "Agentless", "llm_base": "gpt-4o-mini-2024-07-18", "openness": "closed_api_available", "tool_usage": "standard", "score": 32.5, "metric": "resolve_rate", "submission_time": "2025-11-24T19:56:00.092916", "tags": ["swe-bench"], "total_cost": 26.25, "total_runtime": 462.5}
5
+ {"agent_name": "SWE-Agent", "llm_base": "claude-3-opus-20240229", "openness": "closed_api_available", "tool_usage": "custom_interface", "score": 29.8, "metric": "resolve_rate", "submission_time": "2025-11-24T19:56:00.092922", "tags": ["swe-bench"], "total_cost": 24.9, "total_runtime": 449.0}
6
+ {"agent_name": "OpenHands CodeAct v2.1", "llm_base": "claude-3-5-sonnet-20241022", "openness": "closed_api_available", "tool_usage": "standard", "score": 35.2, "metric": "resolve_rate", "submission_time": "2025-11-24T19:56:00.093026", "tags": ["multi-swe-bench"], "total_cost": 27.6, "total_runtime": 476.0}
7
+ {"agent_name": "OpenHands CodeAct v2.0", "llm_base": "gpt-4o-2024-11-20", "openness": "closed_api_available", "tool_usage": "standard", "score": 32.8, "metric": "resolve_rate", "submission_time": "2025-11-24T19:56:00.093040", "tags": ["multi-swe-bench"], "total_cost": 26.4, "total_runtime": 464.0}
8
+ {"agent_name": "AutoCodeRover", "llm_base": "gpt-4-turbo-2024-04-09", "openness": "closed_api_available", "tool_usage": "standard", "score": 28.4, "metric": "resolve_rate", "submission_time": "2025-11-24T19:56:00.093048", "tags": ["multi-swe-bench"], "total_cost": 24.2, "total_runtime": 442.0}
9
+ {"agent_name": "Agentless", "llm_base": "gpt-4o-mini-2024-07-18", "openness": "closed_api_available", "tool_usage": "standard", "score": 24.1, "metric": "resolve_rate", "submission_time": "2025-11-24T19:56:00.093058", "tags": ["multi-swe-bench"], "total_cost": 22.05, "total_runtime": 420.5}
10
+ {"agent_name": "SWE-Agent", "llm_base": "claude-3-opus-20240229", "openness": "closed_api_available", "tool_usage": "custom_interface", "score": 21.5, "metric": "resolve_rate", "submission_time": "2025-11-24T19:56:00.093067", "tags": ["multi-swe-bench"], "total_cost": 20.75, "total_runtime": 407.5}
11
+ {"agent_name": "OpenHands CodeAct v2.1", "llm_base": "claude-3-5-sonnet-20241022", "openness": "closed_api_available", "tool_usage": "standard", "score": 42.1, "metric": "resolve_rate", "submission_time": "2025-11-24T19:56:00.093172", "tags": ["swe-bench-multimodal"], "total_cost": 31.05, "total_runtime": 510.5}
12
+ {"agent_name": "OpenHands CodeAct v2.0", "llm_base": "gpt-4o-2024-11-20", "openness": "closed_api_available", "tool_usage": "standard", "score": 39.5, "metric": "resolve_rate", "submission_time": "2025-11-24T19:56:00.093187", "tags": ["swe-bench-multimodal"], "total_cost": 29.75, "total_runtime": 497.5}
13
+ {"agent_name": "AutoCodeRover", "llm_base": "gpt-4-turbo-2024-04-09", "openness": "closed_api_available", "tool_usage": "standard", "score": 34.2, "metric": "resolve_rate", "submission_time": "2025-11-24T19:56:00.093200", "tags": ["swe-bench-multimodal"], "total_cost": 27.1, "total_runtime": 471.0}
14
+ {"agent_name": "Agentless", "llm_base": "gpt-4o-mini-2024-07-18", "openness": "closed_api_available", "tool_usage": "standard", "score": 28.9, "metric": "resolve_rate", "submission_time": "2025-11-24T19:56:00.093208", "tags": ["swe-bench-multimodal"], "total_cost": 24.45, "total_runtime": 444.5}
15
+ {"agent_name": "SWE-Agent", "llm_base": "claude-3-opus-20240229", "openness": "closed_api_available", "tool_usage": "custom_interface", "score": 25.7, "metric": "resolve_rate", "submission_time": "2025-11-24T19:56:00.093218", "tags": ["swe-bench-multimodal"], "total_cost": 22.85, "total_runtime": 428.5}
16
+ {"agent_name": "OpenHands CodeAct v2.1", "llm_base": "claude-3-5-sonnet-20241022", "openness": "closed_api_available", "tool_usage": "standard", "score": 65.4, "metric": "success_rate", "submission_time": "2025-11-24T19:56:00.093304", "tags": ["swt-bench"], "total_cost": 42.7, "total_runtime": 627.0}
17
+ {"agent_name": "OpenHands CodeAct v2.0", "llm_base": "gpt-4o-2024-11-20", "openness": "closed_api_available", "tool_usage": "standard", "score": 62.3, "metric": "success_rate", "submission_time": "2025-11-24T19:56:00.093317", "tags": ["swt-bench"], "total_cost": 41.15, "total_runtime": 611.5}
18
+ {"agent_name": "AutoCodeRover", "llm_base": "gpt-4-turbo-2024-04-09", "openness": "closed_api_available", "tool_usage": "standard", "score": 54.1, "metric": "success_rate", "submission_time": "2025-11-24T19:56:00.093326", "tags": ["swt-bench"], "total_cost": 37.05, "total_runtime": 570.5}
19
+ {"agent_name": "Agentless", "llm_base": "gpt-4o-mini-2024-07-18", "openness": "closed_api_available", "tool_usage": "standard", "score": 47.8, "metric": "success_rate", "submission_time": "2025-11-24T19:56:00.093333", "tags": ["swt-bench"], "total_cost": 33.9, "total_runtime": 539.0}
20
+ {"agent_name": "SWE-Agent", "llm_base": "claude-3-opus-20240229", "openness": "closed_api_available", "tool_usage": "custom_interface", "score": 44.2, "metric": "success_rate", "submission_time": "2025-11-24T19:56:00.093338", "tags": ["swt-bench"], "total_cost": 32.1, "total_runtime": 521.0}
21
+ {"agent_name": "OpenHands CodeAct v2.1", "llm_base": "claude-3-5-sonnet-20241022", "openness": "closed_api_available", "tool_usage": "standard", "score": 71.2, "metric": "test_pass_rate", "submission_time": "2025-11-24T19:56:00.093416", "tags": ["commit0"], "total_cost": 45.6, "total_runtime": 656.0}
22
+ {"agent_name": "OpenHands CodeAct v2.0", "llm_base": "gpt-4o-2024-11-20", "openness": "closed_api_available", "tool_usage": "standard", "score": 68.9, "metric": "test_pass_rate", "submission_time": "2025-11-24T19:56:00.093428", "tags": ["commit0"], "total_cost": 44.45, "total_runtime": 644.5}
23
+ {"agent_name": "AutoCodeRover", "llm_base": "gpt-4-turbo-2024-04-09", "openness": "closed_api_available", "tool_usage": "standard", "score": 61.5, "metric": "test_pass_rate", "submission_time": "2025-11-24T19:56:00.093436", "tags": ["commit0"], "total_cost": 40.75, "total_runtime": 607.5}
24
+ {"agent_name": "Agentless", "llm_base": "gpt-4o-mini-2024-07-18", "openness": "closed_api_available", "tool_usage": "standard", "score": 55.3, "metric": "test_pass_rate", "submission_time": "2025-11-24T19:56:00.093445", "tags": ["commit0"], "total_cost": 37.65, "total_runtime": 576.5}
25
+ {"agent_name": "SWE-Agent", "llm_base": "claude-3-opus-20240229", "openness": "closed_api_available", "tool_usage": "custom_interface", "score": 52.1, "metric": "test_pass_rate", "submission_time": "2025-11-24T19:56:00.093450", "tags": ["commit0"], "total_cost": 36.05, "total_runtime": 560.5}
26
+ {"agent_name": "OpenHands CodeAct v2.1", "llm_base": "claude-3-5-sonnet-20241022", "openness": "closed_api_available", "tool_usage": "standard", "score": 58.7, "metric": "accuracy", "submission_time": "2025-11-24T19:56:00.093541", "tags": ["gaia"], "total_cost": 39.35, "total_runtime": 593.5}
27
+ {"agent_name": "OpenHands CodeAct v2.0", "llm_base": "gpt-4o-2024-11-20", "openness": "closed_api_available", "tool_usage": "standard", "score": 55.2, "metric": "accuracy", "submission_time": "2025-11-24T19:56:00.093556", "tags": ["gaia"], "total_cost": 37.6, "total_runtime": 576.0}
28
+ {"agent_name": "AutoCodeRover", "llm_base": "gpt-4-turbo-2024-04-09", "openness": "closed_api_available", "tool_usage": "standard", "score": 48.3, "metric": "accuracy", "submission_time": "2025-11-24T19:56:00.093563", "tags": ["gaia"], "total_cost": 34.15, "total_runtime": 541.5}
29
+ {"agent_name": "Agentless", "llm_base": "gpt-4o-mini-2024-07-18", "openness": "closed_api_available", "tool_usage": "standard", "score": 42.1, "metric": "accuracy", "submission_time": "2025-11-24T19:56:00.093568", "tags": ["gaia"], "total_cost": 31.05, "total_runtime": 510.5}
30
+ {"agent_name": "SWE-Agent", "llm_base": "claude-3-opus-20240229", "openness": "closed_api_available", "tool_usage": "custom_interface", "score": 39.4, "metric": "accuracy", "submission_time": "2025-11-24T19:56:00.093574", "tags": ["gaia"], "total_cost": 29.7, "total_runtime": 497.0}
data/extracted/validation.parquet ADDED
Binary file (9.29 kB). View file
 
generate_mock_jsonl.py CHANGED
@@ -41,7 +41,7 @@ BENCHMARKS = {
41
  # Mock agents with realistic scores
42
  MOCK_AGENTS = [
43
  {
44
- "agent_name": "OpenHands CodeAct v2.1",
45
  "llm_base": "claude-3-5-sonnet-20241022",
46
  "openness": "closed_api_available",
47
  "tool_usage": "standard",
@@ -55,7 +55,7 @@ MOCK_AGENTS = [
55
  }
56
  },
57
  {
58
- "agent_name": "OpenHands CodeAct v2.0",
59
  "llm_base": "gpt-4o-2024-11-20",
60
  "openness": "closed_api_available",
61
  "tool_usage": "standard",
@@ -69,7 +69,7 @@ MOCK_AGENTS = [
69
  }
70
  },
71
  {
72
- "agent_name": "AutoCodeRover",
73
  "llm_base": "gpt-4-turbo-2024-04-09",
74
  "openness": "closed_api_available",
75
  "tool_usage": "standard",
@@ -83,7 +83,7 @@ MOCK_AGENTS = [
83
  }
84
  },
85
  {
86
- "agent_name": "Agentless",
87
  "llm_base": "gpt-4o-mini-2024-07-18",
88
  "openness": "closed_api_available",
89
  "tool_usage": "standard",
@@ -97,7 +97,7 @@ MOCK_AGENTS = [
97
  }
98
  },
99
  {
100
- "agent_name": "SWE-Agent",
101
  "llm_base": "claude-3-opus-20240229",
102
  "openness": "closed_api_available",
103
  "tool_usage": "custom_interface",
 
41
  # Mock agents with realistic scores
42
  MOCK_AGENTS = [
43
  {
44
+ "agent_name": "1.0.2",
45
  "llm_base": "claude-3-5-sonnet-20241022",
46
  "openness": "closed_api_available",
47
  "tool_usage": "standard",
 
55
  }
56
  },
57
  {
58
+ "agent_name": "1.0.1",
59
  "llm_base": "gpt-4o-2024-11-20",
60
  "openness": "closed_api_available",
61
  "tool_usage": "standard",
 
69
  }
70
  },
71
  {
72
+ "agent_name": "1.0.0",
73
  "llm_base": "gpt-4-turbo-2024-04-09",
74
  "openness": "closed_api_available",
75
  "tool_usage": "standard",
 
83
  }
84
  },
85
  {
86
+ "agent_name": "0.9.5",
87
  "llm_base": "gpt-4o-mini-2024-07-18",
88
  "openness": "closed_api_available",
89
  "tool_usage": "standard",
 
97
  }
98
  },
99
  {
100
+ "agent_name": "0.9.0",
101
  "llm_base": "claude-3-opus-20240229",
102
  "openness": "closed_api_available",
103
  "tool_usage": "custom_interface",
leaderboard_transformer.py CHANGED
@@ -103,16 +103,15 @@ def _pretty_column_name(raw_col: str) -> str:
103
  # Case 1: Handle fixed, special-case mappings first.
104
  fixed_mappings = {
105
  'id': 'id',
106
- 'Agent': 'Agent',
107
  'Agent description': 'Agent Description',
108
- 'User/organization': 'Submitter',
109
  'Submission date': 'Date',
110
  'Overall': 'Overall Score',
111
  'Overall cost': 'Overall Cost',
112
  'Logs': 'Logs',
113
  'Openness': 'Openness',
114
  'Agent tooling': 'Agent Tooling',
115
- 'LLM base': 'Models Used',
116
  'Source': 'Source',
117
  }
118
 
 
103
  # Case 1: Handle fixed, special-case mappings first.
104
  fixed_mappings = {
105
  'id': 'id',
106
+ 'Agent': 'Agent Version',
107
  'Agent description': 'Agent Description',
 
108
  'Submission date': 'Date',
109
  'Overall': 'Overall Score',
110
  'Overall cost': 'Overall Cost',
111
  'Logs': 'Logs',
112
  'Openness': 'Openness',
113
  'Agent tooling': 'Agent Tooling',
114
+ 'LLM base': 'Model',
115
  'Source': 'Source',
116
  }
117
 
mock_results/1.0.0-dev1/agenteval.json CHANGED
@@ -4,106 +4,66 @@
4
  "version": "1.0.0-dev1",
5
  "splits": [
6
  {
7
- "name": "test",
8
  "tasks": [
9
  {
10
  "name": "swe-bench",
11
  "tags": [
12
- "Overall",
13
- "Bug Fixing",
14
  "swe-bench"
15
  ]
16
- },
17
- {
18
- "name": "swe-bench-multimodal",
19
- "tags": [
20
- "Overall",
21
- "Bug Fixing",
22
- "swe-bench-multimodal"
23
- ]
24
- },
25
- {
26
- "name": "commit0",
27
- "tags": [
28
- "Overall",
29
- "App Creation",
30
- "commit0"
31
- ]
32
- },
33
  {
34
  "name": "multi-swe-bench",
35
  "tags": [
36
- "Overall",
37
- "Frontend Development",
38
  "multi-swe-bench"
39
  ]
40
- },
41
- {
42
- "name": "swt-bench",
43
- "tags": [
44
- "Overall",
45
- "Test Generation",
46
- "swt-bench"
47
- ]
48
- },
49
- {
50
- "name": "gaia",
51
- "tags": [
52
- "Overall",
53
- "Information Gathering",
54
- "gaia"
55
- ]
56
  }
57
  ]
58
  },
59
  {
60
- "name": "validation",
61
  "tasks": [
62
- {
63
- "name": "swe-bench",
64
- "tags": [
65
- "Overall",
66
- "Bug Fixing",
67
- "swe-bench"
68
- ]
69
- },
70
  {
71
  "name": "swe-bench-multimodal",
72
  "tags": [
73
- "Overall",
74
- "Bug Fixing",
75
  "swe-bench-multimodal"
76
  ]
77
- },
78
- {
79
- "name": "commit0",
80
- "tags": [
81
- "Overall",
82
- "App Creation",
83
- "commit0"
84
- ]
85
- },
86
  {
87
- "name": "multi-swe-bench",
88
  "tags": [
89
- "Overall",
90
- "Frontend Development",
91
- "multi-swe-bench"
92
  ]
93
- },
 
 
 
 
 
94
  {
95
- "name": "swt-bench",
96
  "tags": [
97
- "Overall",
98
- "Test Generation",
99
- "swt-bench"
100
  ]
101
- },
 
 
 
 
 
102
  {
103
  "name": "gaia",
104
  "tags": [
105
- "Overall",
106
- "Information Gathering",
107
  "gaia"
108
  ]
109
  }
 
4
  "version": "1.0.0-dev1",
5
  "splits": [
6
  {
7
+ "name": "swe-bench",
8
  "tasks": [
9
  {
10
  "name": "swe-bench",
11
  "tags": [
 
 
12
  "swe-bench"
13
  ]
14
+ }
15
+ ]
16
+ },
17
+ {
18
+ "name": "multi-swe-bench",
19
+ "tasks": [
 
 
 
 
 
 
 
 
 
 
 
20
  {
21
  "name": "multi-swe-bench",
22
  "tags": [
 
 
23
  "multi-swe-bench"
24
  ]
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
25
  }
26
  ]
27
  },
28
  {
29
+ "name": "swe-bench-multimodal",
30
  "tasks": [
 
 
 
 
 
 
 
 
31
  {
32
  "name": "swe-bench-multimodal",
33
  "tags": [
 
 
34
  "swe-bench-multimodal"
35
  ]
36
+ }
37
+ ]
38
+ },
39
+ {
40
+ "name": "swt-bench",
41
+ "tasks": [
 
 
 
42
  {
43
+ "name": "swt-bench",
44
  "tags": [
45
+ "swt-bench"
 
 
46
  ]
47
+ }
48
+ ]
49
+ },
50
+ {
51
+ "name": "commit0",
52
+ "tasks": [
53
  {
54
+ "name": "commit0",
55
  "tags": [
56
+ "commit0"
 
 
57
  ]
58
+ }
59
+ ]
60
+ },
61
+ {
62
+ "name": "gaia",
63
+ "tasks": [
64
  {
65
  "name": "gaia",
66
  "tags": [
 
 
67
  "gaia"
68
  ]
69
  }
mock_results/1.0.0-dev1/commit0.jsonl CHANGED
@@ -1,5 +1,5 @@
1
- {"agent_name": "OpenHands CodeAct v2.1", "llm_base": "claude-3-5-sonnet-20241022", "openness": "closed_api_available", "tool_usage": "standard", "score": 71.2, "metric": "test_pass_rate", "submission_time": "2025-11-24T19:56:00.093416", "tags": ["commit0"], "total_cost": 45.6, "total_runtime": 656.0}
2
- {"agent_name": "OpenHands CodeAct v2.0", "llm_base": "gpt-4o-2024-11-20", "openness": "closed_api_available", "tool_usage": "standard", "score": 68.9, "metric": "test_pass_rate", "submission_time": "2025-11-24T19:56:00.093428", "tags": ["commit0"], "total_cost": 44.45, "total_runtime": 644.5}
3
- {"agent_name": "AutoCodeRover", "llm_base": "gpt-4-turbo-2024-04-09", "openness": "closed_api_available", "tool_usage": "standard", "score": 61.5, "metric": "test_pass_rate", "submission_time": "2025-11-24T19:56:00.093436", "tags": ["commit0"], "total_cost": 40.75, "total_runtime": 607.5}
4
- {"agent_name": "Agentless", "llm_base": "gpt-4o-mini-2024-07-18", "openness": "closed_api_available", "tool_usage": "standard", "score": 55.3, "metric": "test_pass_rate", "submission_time": "2025-11-24T19:56:00.093445", "tags": ["commit0"], "total_cost": 37.65, "total_runtime": 576.5}
5
- {"agent_name": "SWE-Agent", "llm_base": "claude-3-opus-20240229", "openness": "closed_api_available", "tool_usage": "custom_interface", "score": 52.1, "metric": "test_pass_rate", "submission_time": "2025-11-24T19:56:00.093450", "tags": ["commit0"], "total_cost": 36.05, "total_runtime": 560.5}
 
1
+ {"agent_name": "1.0.2", "llm_base": "claude-3-5-sonnet-20241022", "openness": "closed_api_available", "tool_usage": "standard", "score": 71.2, "metric": "test_pass_rate", "submission_time": "2025-11-24T23:38:02.231539", "tags": ["commit0"], "total_cost": 45.6, "total_runtime": 656.0}
2
+ {"agent_name": "1.0.1", "llm_base": "gpt-4o-2024-11-20", "openness": "closed_api_available", "tool_usage": "standard", "score": 68.9, "metric": "test_pass_rate", "submission_time": "2025-11-24T23:38:02.231556", "tags": ["commit0"], "total_cost": 44.45, "total_runtime": 644.5}
3
+ {"agent_name": "1.0.0", "llm_base": "gpt-4-turbo-2024-04-09", "openness": "closed_api_available", "tool_usage": "standard", "score": 61.5, "metric": "test_pass_rate", "submission_time": "2025-11-24T23:38:02.231563", "tags": ["commit0"], "total_cost": 40.75, "total_runtime": 607.5}
4
+ {"agent_name": "0.9.5", "llm_base": "gpt-4o-mini-2024-07-18", "openness": "closed_api_available", "tool_usage": "standard", "score": 55.3, "metric": "test_pass_rate", "submission_time": "2025-11-24T23:38:02.231569", "tags": ["commit0"], "total_cost": 37.65, "total_runtime": 576.5}
5
+ {"agent_name": "0.9.0", "llm_base": "claude-3-opus-20240229", "openness": "closed_api_available", "tool_usage": "custom_interface", "score": 52.1, "metric": "test_pass_rate", "submission_time": "2025-11-24T23:38:02.231574", "tags": ["commit0"], "total_cost": 36.05, "total_runtime": 560.5}
mock_results/1.0.0-dev1/gaia.jsonl CHANGED
@@ -1,5 +1,5 @@
1
- {"agent_name": "OpenHands CodeAct v2.1", "llm_base": "claude-3-5-sonnet-20241022", "openness": "closed_api_available", "tool_usage": "standard", "score": 58.7, "metric": "accuracy", "submission_time": "2025-11-24T19:56:00.093541", "tags": ["gaia"], "total_cost": 39.35, "total_runtime": 593.5}
2
- {"agent_name": "OpenHands CodeAct v2.0", "llm_base": "gpt-4o-2024-11-20", "openness": "closed_api_available", "tool_usage": "standard", "score": 55.2, "metric": "accuracy", "submission_time": "2025-11-24T19:56:00.093556", "tags": ["gaia"], "total_cost": 37.6, "total_runtime": 576.0}
3
- {"agent_name": "AutoCodeRover", "llm_base": "gpt-4-turbo-2024-04-09", "openness": "closed_api_available", "tool_usage": "standard", "score": 48.3, "metric": "accuracy", "submission_time": "2025-11-24T19:56:00.093563", "tags": ["gaia"], "total_cost": 34.15, "total_runtime": 541.5}
4
- {"agent_name": "Agentless", "llm_base": "gpt-4o-mini-2024-07-18", "openness": "closed_api_available", "tool_usage": "standard", "score": 42.1, "metric": "accuracy", "submission_time": "2025-11-24T19:56:00.093568", "tags": ["gaia"], "total_cost": 31.05, "total_runtime": 510.5}
5
- {"agent_name": "SWE-Agent", "llm_base": "claude-3-opus-20240229", "openness": "closed_api_available", "tool_usage": "custom_interface", "score": 39.4, "metric": "accuracy", "submission_time": "2025-11-24T19:56:00.093574", "tags": ["gaia"], "total_cost": 29.7, "total_runtime": 497.0}
 
1
+ {"agent_name": "1.0.2", "llm_base": "claude-3-5-sonnet-20241022", "openness": "closed_api_available", "tool_usage": "standard", "score": 58.7, "metric": "accuracy", "submission_time": "2025-11-24T23:38:02.231711", "tags": ["gaia"], "total_cost": 39.35, "total_runtime": 593.5}
2
+ {"agent_name": "1.0.1", "llm_base": "gpt-4o-2024-11-20", "openness": "closed_api_available", "tool_usage": "standard", "score": 55.2, "metric": "accuracy", "submission_time": "2025-11-24T23:38:02.231728", "tags": ["gaia"], "total_cost": 37.6, "total_runtime": 576.0}
3
+ {"agent_name": "1.0.0", "llm_base": "gpt-4-turbo-2024-04-09", "openness": "closed_api_available", "tool_usage": "standard", "score": 48.3, "metric": "accuracy", "submission_time": "2025-11-24T23:38:02.231735", "tags": ["gaia"], "total_cost": 34.15, "total_runtime": 541.5}
4
+ {"agent_name": "0.9.5", "llm_base": "gpt-4o-mini-2024-07-18", "openness": "closed_api_available", "tool_usage": "standard", "score": 42.1, "metric": "accuracy", "submission_time": "2025-11-24T23:38:02.231741", "tags": ["gaia"], "total_cost": 31.05, "total_runtime": 510.5}
5
+ {"agent_name": "0.9.0", "llm_base": "claude-3-opus-20240229", "openness": "closed_api_available", "tool_usage": "custom_interface", "score": 39.4, "metric": "accuracy", "submission_time": "2025-11-24T23:38:02.231749", "tags": ["gaia"], "total_cost": 29.7, "total_runtime": 497.0}
mock_results/1.0.0-dev1/multi-swe-bench.jsonl CHANGED
@@ -1,5 +1,5 @@
1
- {"agent_name": "OpenHands CodeAct v2.1", "llm_base": "claude-3-5-sonnet-20241022", "openness": "closed_api_available", "tool_usage": "standard", "score": 35.2, "metric": "resolve_rate", "submission_time": "2025-11-24T19:56:00.093026", "tags": ["multi-swe-bench"], "total_cost": 27.6, "total_runtime": 476.0}
2
- {"agent_name": "OpenHands CodeAct v2.0", "llm_base": "gpt-4o-2024-11-20", "openness": "closed_api_available", "tool_usage": "standard", "score": 32.8, "metric": "resolve_rate", "submission_time": "2025-11-24T19:56:00.093040", "tags": ["multi-swe-bench"], "total_cost": 26.4, "total_runtime": 464.0}
3
- {"agent_name": "AutoCodeRover", "llm_base": "gpt-4-turbo-2024-04-09", "openness": "closed_api_available", "tool_usage": "standard", "score": 28.4, "metric": "resolve_rate", "submission_time": "2025-11-24T19:56:00.093048", "tags": ["multi-swe-bench"], "total_cost": 24.2, "total_runtime": 442.0}
4
- {"agent_name": "Agentless", "llm_base": "gpt-4o-mini-2024-07-18", "openness": "closed_api_available", "tool_usage": "standard", "score": 24.1, "metric": "resolve_rate", "submission_time": "2025-11-24T19:56:00.093058", "tags": ["multi-swe-bench"], "total_cost": 22.05, "total_runtime": 420.5}
5
- {"agent_name": "SWE-Agent", "llm_base": "claude-3-opus-20240229", "openness": "closed_api_available", "tool_usage": "custom_interface", "score": 21.5, "metric": "resolve_rate", "submission_time": "2025-11-24T19:56:00.093067", "tags": ["multi-swe-bench"], "total_cost": 20.75, "total_runtime": 407.5}
 
1
+ {"agent_name": "1.0.2", "llm_base": "claude-3-5-sonnet-20241022", "openness": "closed_api_available", "tool_usage": "standard", "score": 35.2, "metric": "resolve_rate", "submission_time": "2025-11-24T23:38:02.230869", "tags": ["multi-swe-bench"], "total_cost": 27.6, "total_runtime": 476.0}
2
+ {"agent_name": "1.0.1", "llm_base": "gpt-4o-2024-11-20", "openness": "closed_api_available", "tool_usage": "standard", "score": 32.8, "metric": "resolve_rate", "submission_time": "2025-11-24T23:38:02.230889", "tags": ["multi-swe-bench"], "total_cost": 26.4, "total_runtime": 464.0}
3
+ {"agent_name": "1.0.0", "llm_base": "gpt-4-turbo-2024-04-09", "openness": "closed_api_available", "tool_usage": "standard", "score": 28.4, "metric": "resolve_rate", "submission_time": "2025-11-24T23:38:02.230899", "tags": ["multi-swe-bench"], "total_cost": 24.2, "total_runtime": 442.0}
4
+ {"agent_name": "0.9.5", "llm_base": "gpt-4o-mini-2024-07-18", "openness": "closed_api_available", "tool_usage": "standard", "score": 24.1, "metric": "resolve_rate", "submission_time": "2025-11-24T23:38:02.230908", "tags": ["multi-swe-bench"], "total_cost": 22.05, "total_runtime": 420.5}
5
+ {"agent_name": "0.9.0", "llm_base": "claude-3-opus-20240229", "openness": "closed_api_available", "tool_usage": "custom_interface", "score": 21.5, "metric": "resolve_rate", "submission_time": "2025-11-24T23:38:02.230917", "tags": ["multi-swe-bench"], "total_cost": 20.75, "total_runtime": 407.5}
mock_results/1.0.0-dev1/swe-bench-multimodal.jsonl CHANGED
@@ -1,5 +1,5 @@
1
- {"agent_name": "OpenHands CodeAct v2.1", "llm_base": "claude-3-5-sonnet-20241022", "openness": "closed_api_available", "tool_usage": "standard", "score": 42.1, "metric": "resolve_rate", "submission_time": "2025-11-24T19:56:00.093172", "tags": ["swe-bench-multimodal"], "total_cost": 31.05, "total_runtime": 510.5}
2
- {"agent_name": "OpenHands CodeAct v2.0", "llm_base": "gpt-4o-2024-11-20", "openness": "closed_api_available", "tool_usage": "standard", "score": 39.5, "metric": "resolve_rate", "submission_time": "2025-11-24T19:56:00.093187", "tags": ["swe-bench-multimodal"], "total_cost": 29.75, "total_runtime": 497.5}
3
- {"agent_name": "AutoCodeRover", "llm_base": "gpt-4-turbo-2024-04-09", "openness": "closed_api_available", "tool_usage": "standard", "score": 34.2, "metric": "resolve_rate", "submission_time": "2025-11-24T19:56:00.093200", "tags": ["swe-bench-multimodal"], "total_cost": 27.1, "total_runtime": 471.0}
4
- {"agent_name": "Agentless", "llm_base": "gpt-4o-mini-2024-07-18", "openness": "closed_api_available", "tool_usage": "standard", "score": 28.9, "metric": "resolve_rate", "submission_time": "2025-11-24T19:56:00.093208", "tags": ["swe-bench-multimodal"], "total_cost": 24.45, "total_runtime": 444.5}
5
- {"agent_name": "SWE-Agent", "llm_base": "claude-3-opus-20240229", "openness": "closed_api_available", "tool_usage": "custom_interface", "score": 25.7, "metric": "resolve_rate", "submission_time": "2025-11-24T19:56:00.093218", "tags": ["swe-bench-multimodal"], "total_cost": 22.85, "total_runtime": 428.5}
 
1
+ {"agent_name": "1.0.2", "llm_base": "claude-3-5-sonnet-20241022", "openness": "closed_api_available", "tool_usage": "standard", "score": 42.1, "metric": "resolve_rate", "submission_time": "2025-11-24T23:38:02.231057", "tags": ["swe-bench-multimodal"], "total_cost": 31.05, "total_runtime": 510.5}
2
+ {"agent_name": "1.0.1", "llm_base": "gpt-4o-2024-11-20", "openness": "closed_api_available", "tool_usage": "standard", "score": 39.5, "metric": "resolve_rate", "submission_time": "2025-11-24T23:38:02.231073", "tags": ["swe-bench-multimodal"], "total_cost": 29.75, "total_runtime": 497.5}
3
+ {"agent_name": "1.0.0", "llm_base": "gpt-4-turbo-2024-04-09", "openness": "closed_api_available", "tool_usage": "standard", "score": 34.2, "metric": "resolve_rate", "submission_time": "2025-11-24T23:38:02.231082", "tags": ["swe-bench-multimodal"], "total_cost": 27.1, "total_runtime": 471.0}
4
+ {"agent_name": "0.9.5", "llm_base": "gpt-4o-mini-2024-07-18", "openness": "closed_api_available", "tool_usage": "standard", "score": 28.9, "metric": "resolve_rate", "submission_time": "2025-11-24T23:38:02.231088", "tags": ["swe-bench-multimodal"], "total_cost": 24.45, "total_runtime": 444.5}
5
+ {"agent_name": "0.9.0", "llm_base": "claude-3-opus-20240229", "openness": "closed_api_available", "tool_usage": "custom_interface", "score": 25.7, "metric": "resolve_rate", "submission_time": "2025-11-24T23:38:02.231093", "tags": ["swe-bench-multimodal"], "total_cost": 22.85, "total_runtime": 428.5}
mock_results/1.0.0-dev1/swe-bench.jsonl CHANGED
@@ -1,5 +1,5 @@
1
- {"agent_name": "OpenHands CodeAct v2.1", "llm_base": "claude-3-5-sonnet-20241022", "openness": "closed_api_available", "tool_usage": "standard", "score": 48.3, "metric": "resolve_rate", "submission_time": "2025-11-24T19:56:00.092865", "tags": ["swe-bench"], "total_cost": 34.15, "total_runtime": 541.5}
2
- {"agent_name": "OpenHands CodeAct v2.0", "llm_base": "gpt-4o-2024-11-20", "openness": "closed_api_available", "tool_usage": "standard", "score": 45.1, "metric": "resolve_rate", "submission_time": "2025-11-24T19:56:00.092895", "tags": ["swe-bench"], "total_cost": 32.55, "total_runtime": 525.5}
3
- {"agent_name": "AutoCodeRover", "llm_base": "gpt-4-turbo-2024-04-09", "openness": "closed_api_available", "tool_usage": "standard", "score": 38.7, "metric": "resolve_rate", "submission_time": "2025-11-24T19:56:00.092908", "tags": ["swe-bench"], "total_cost": 29.35, "total_runtime": 493.5}
4
- {"agent_name": "Agentless", "llm_base": "gpt-4o-mini-2024-07-18", "openness": "closed_api_available", "tool_usage": "standard", "score": 32.5, "metric": "resolve_rate", "submission_time": "2025-11-24T19:56:00.092916", "tags": ["swe-bench"], "total_cost": 26.25, "total_runtime": 462.5}
5
- {"agent_name": "SWE-Agent", "llm_base": "claude-3-opus-20240229", "openness": "closed_api_available", "tool_usage": "custom_interface", "score": 29.8, "metric": "resolve_rate", "submission_time": "2025-11-24T19:56:00.092922", "tags": ["swe-bench"], "total_cost": 24.9, "total_runtime": 449.0}
 
1
+ {"agent_name": "1.0.2", "llm_base": "claude-3-5-sonnet-20241022", "openness": "closed_api_available", "tool_usage": "standard", "score": 48.3, "metric": "resolve_rate", "submission_time": "2025-11-24T23:38:02.230638", "tags": ["swe-bench"], "total_cost": 34.15, "total_runtime": 541.5}
2
+ {"agent_name": "1.0.1", "llm_base": "gpt-4o-2024-11-20", "openness": "closed_api_available", "tool_usage": "standard", "score": 45.1, "metric": "resolve_rate", "submission_time": "2025-11-24T23:38:02.230668", "tags": ["swe-bench"], "total_cost": 32.55, "total_runtime": 525.5}
3
+ {"agent_name": "1.0.0", "llm_base": "gpt-4-turbo-2024-04-09", "openness": "closed_api_available", "tool_usage": "standard", "score": 38.7, "metric": "resolve_rate", "submission_time": "2025-11-24T23:38:02.230681", "tags": ["swe-bench"], "total_cost": 29.35, "total_runtime": 493.5}
4
+ {"agent_name": "0.9.5", "llm_base": "gpt-4o-mini-2024-07-18", "openness": "closed_api_available", "tool_usage": "standard", "score": 32.5, "metric": "resolve_rate", "submission_time": "2025-11-24T23:38:02.230689", "tags": ["swe-bench"], "total_cost": 26.25, "total_runtime": 462.5}
5
+ {"agent_name": "0.9.0", "llm_base": "claude-3-opus-20240229", "openness": "closed_api_available", "tool_usage": "custom_interface", "score": 29.8, "metric": "resolve_rate", "submission_time": "2025-11-24T23:38:02.230696", "tags": ["swe-bench"], "total_cost": 24.9, "total_runtime": 449.0}
mock_results/1.0.0-dev1/swt-bench.jsonl CHANGED
@@ -1,5 +1,5 @@
1
- {"agent_name": "OpenHands CodeAct v2.1", "llm_base": "claude-3-5-sonnet-20241022", "openness": "closed_api_available", "tool_usage": "standard", "score": 65.4, "metric": "success_rate", "submission_time": "2025-11-24T19:56:00.093304", "tags": ["swt-bench"], "total_cost": 42.7, "total_runtime": 627.0}
2
- {"agent_name": "OpenHands CodeAct v2.0", "llm_base": "gpt-4o-2024-11-20", "openness": "closed_api_available", "tool_usage": "standard", "score": 62.3, "metric": "success_rate", "submission_time": "2025-11-24T19:56:00.093317", "tags": ["swt-bench"], "total_cost": 41.15, "total_runtime": 611.5}
3
- {"agent_name": "AutoCodeRover", "llm_base": "gpt-4-turbo-2024-04-09", "openness": "closed_api_available", "tool_usage": "standard", "score": 54.1, "metric": "success_rate", "submission_time": "2025-11-24T19:56:00.093326", "tags": ["swt-bench"], "total_cost": 37.05, "total_runtime": 570.5}
4
- {"agent_name": "Agentless", "llm_base": "gpt-4o-mini-2024-07-18", "openness": "closed_api_available", "tool_usage": "standard", "score": 47.8, "metric": "success_rate", "submission_time": "2025-11-24T19:56:00.093333", "tags": ["swt-bench"], "total_cost": 33.9, "total_runtime": 539.0}
5
- {"agent_name": "SWE-Agent", "llm_base": "claude-3-opus-20240229", "openness": "closed_api_available", "tool_usage": "custom_interface", "score": 44.2, "metric": "success_rate", "submission_time": "2025-11-24T19:56:00.093338", "tags": ["swt-bench"], "total_cost": 32.1, "total_runtime": 521.0}
 
1
+ {"agent_name": "1.0.2", "llm_base": "claude-3-5-sonnet-20241022", "openness": "closed_api_available", "tool_usage": "standard", "score": 65.4, "metric": "success_rate", "submission_time": "2025-11-24T23:38:02.231319", "tags": ["swt-bench"], "total_cost": 42.7, "total_runtime": 627.0}
2
+ {"agent_name": "1.0.1", "llm_base": "gpt-4o-2024-11-20", "openness": "closed_api_available", "tool_usage": "standard", "score": 62.3, "metric": "success_rate", "submission_time": "2025-11-24T23:38:02.231344", "tags": ["swt-bench"], "total_cost": 41.15, "total_runtime": 611.5}
3
+ {"agent_name": "1.0.0", "llm_base": "gpt-4-turbo-2024-04-09", "openness": "closed_api_available", "tool_usage": "standard", "score": 54.1, "metric": "success_rate", "submission_time": "2025-11-24T23:38:02.231356", "tags": ["swt-bench"], "total_cost": 37.05, "total_runtime": 570.5}
4
+ {"agent_name": "0.9.5", "llm_base": "gpt-4o-mini-2024-07-18", "openness": "closed_api_available", "tool_usage": "standard", "score": 47.8, "metric": "success_rate", "submission_time": "2025-11-24T23:38:02.231365", "tags": ["swt-bench"], "total_cost": 33.9, "total_runtime": 539.0}
5
+ {"agent_name": "0.9.0", "llm_base": "claude-3-opus-20240229", "openness": "closed_api_available", "tool_usage": "custom_interface", "score": 44.2, "metric": "success_rate", "submission_time": "2025-11-24T23:38:02.231373", "tags": ["swt-bench"], "total_cost": 32.1, "total_runtime": 521.0}
simple_data_loader.py CHANGED
@@ -92,14 +92,13 @@ class SimpleLeaderboardViewer:
92
  first_record = agent_records.iloc[0]
93
  record = {
94
  # Core agent info - use final display names
95
- 'agent': agent_name, # Will become "Agent" after prettifying
96
- 'models used': first_record['llm_base'], # Will become "Models Used"
97
  'openness': first_record['openness'], # Will become "Openness"
98
  'agent tooling': first_record['tool_usage'], # Will become "Agent Tooling"
99
  'date': first_record['submission_time'], # Will become "Date"
100
  # Additional columns expected by the transformer
101
  'id': first_record.get('id', agent_name), # Will become "Id"
102
- 'submitter': first_record.get('submitter', 'Unknown'), # Will become "Submitter"
103
  'source': first_record.get('source', ''), # Will become "Source"
104
  'logs': first_record.get('logs', ''), # Will become "Logs"
105
  }
 
92
  first_record = agent_records.iloc[0]
93
  record = {
94
  # Core agent info - use final display names
95
+ 'agent': agent_name, # Will become "Agent Version" after prettifying
96
+ 'models used': first_record['llm_base'], # Will become "Model"
97
  'openness': first_record['openness'], # Will become "Openness"
98
  'agent tooling': first_record['tool_usage'], # Will become "Agent Tooling"
99
  'date': first_record['submission_time'], # Will become "Date"
100
  # Additional columns expected by the transformer
101
  'id': first_record.get('id', agent_name), # Will become "Id"
 
102
  'source': first_record.get('source', ''), # Will become "Source"
103
  'logs': first_record.get('logs', ''), # Will become "Logs"
104
  }