Spaces:
Running
Running
openhands
openhands
committed on
Commit
·
0ee2099
1
Parent(s):
5e9c3b9
Update UI: All-Hands-AI color scheme, agent version column names, and OpenHands logo
Browse files
- Apply All-Hands-AI color scheme (yellow accent, indigo links, beige background)
- Change 'Agent' column to 'Agent Version'
- Remove 'Submitter' column
- Change 'Models Used' to 'Model'
- Update mock data with version numbers (1.0.2, 1.0.1, etc.)
- Replace logo with OpenHands branding
Co-authored-by: openhands <[email protected]>
- assets/logo.svg +17 -8
- content.py +27 -25
- data/extracted/agenteval.json +74 -0
- data/extracted/commit0.jsonl +5 -0
- data/extracted/gaia.jsonl +5 -0
- data/extracted/multi-swe-bench.jsonl +5 -0
- data/extracted/swe-bench-multimodal.jsonl +5 -0
- data/extracted/swe-bench.jsonl +5 -0
- data/extracted/swt-bench.jsonl +5 -0
- data/extracted/test.jsonl +30 -0
- data/extracted/test.parquet +0 -0
- data/extracted/validation.jsonl +30 -0
- data/extracted/validation.parquet +0 -0
- generate_mock_jsonl.py +5 -5
- leaderboard_transformer.py +2 -3
- mock_results/1.0.0-dev1/agenteval.json +30 -70
- mock_results/1.0.0-dev1/commit0.jsonl +5 -5
- mock_results/1.0.0-dev1/gaia.jsonl +5 -5
- mock_results/1.0.0-dev1/multi-swe-bench.jsonl +5 -5
- mock_results/1.0.0-dev1/swe-bench-multimodal.jsonl +5 -5
- mock_results/1.0.0-dev1/swe-bench.jsonl +5 -5
- mock_results/1.0.0-dev1/swt-bench.jsonl +5 -5
- simple_data_loader.py +2 -3
assets/logo.svg
CHANGED
|
|
|
|
content.py
CHANGED
|
@@ -30,7 +30,7 @@ INTRO_PARAGRAPH = """
|
|
| 30 |
</ul>
|
| 31 |
|
| 32 |
<p>
|
| 33 |
-
This view is designed for quick comparison of general-purpose coding agents. For more details on how we calculate scores and cost, please see the <a href="/about" style="color: #
|
| 34 |
</p>
|
| 35 |
"""
|
| 36 |
SCATTER_DISCLAIMER = """
|
|
@@ -237,14 +237,16 @@ def hf_uri_to_web_url(uri: str) -> str:
|
|
| 237 |
|
| 238 |
|
| 239 |
css = """
|
| 240 |
-
/* CSS Color Variables using
|
| 241 |
:root {
|
| 242 |
-
--color-primary-
|
| 243 |
-
--color-primary-
|
| 244 |
-
--color-neutral-light:
|
| 245 |
-
--color-background-light:
|
| 246 |
-
--color-background-dark:
|
| 247 |
-
--color-text-
|
|
|
|
|
|
|
| 248 |
}
|
| 249 |
|
| 250 |
/* This makes space for the huggingface header bar which must shown on HF spaces. */
|
|
@@ -350,13 +352,13 @@ table.gr-table {
|
|
| 350 |
overflow: visible !important;
|
| 351 |
}
|
| 352 |
#pareto-disclaimer {
|
| 353 |
-
color:
|
| 354 |
}
|
| 355 |
thead.svelte-1e98i6s th {
|
| 356 |
background: white !important;
|
| 357 |
}
|
| 358 |
.dark thead.svelte-1e98i6s th {
|
| 359 |
-
background:
|
| 360 |
}
|
| 361 |
.cell-wrap.svelte-v1pjjd {
|
| 362 |
font-family: 'Manrope';
|
|
@@ -376,10 +378,10 @@ nav.svelte-ti537g.svelte-ti537g {
|
|
| 376 |
position: relative !important;
|
| 377 |
}
|
| 378 |
.dark #leaderboard-accordion .label-wrap {
|
| 379 |
-
color:
|
| 380 |
}
|
| 381 |
.dark block.svelte-1svsvh2 {
|
| 382 |
-
background:
|
| 383 |
}
|
| 384 |
.padding.svelte-phx28p {
|
| 385 |
padding: 0 !important;
|
|
@@ -391,7 +393,7 @@ nav.svelte-ti537g.svelte-ti537g {
|
|
| 391 |
gap: 10px !important;
|
| 392 |
}
|
| 393 |
.dark .primary-link-button {
|
| 394 |
-
color: var(--color-primary-
|
| 395 |
}
|
| 396 |
.primary-link-button {
|
| 397 |
background: none;
|
|
@@ -400,7 +402,7 @@ nav.svelte-ti537g.svelte-ti537g {
|
|
| 400 |
margin: 0;
|
| 401 |
font-family: inherit;
|
| 402 |
font-size: 16px;
|
| 403 |
-
color: var(--color-primary-
|
| 404 |
text-decoration: none;
|
| 405 |
cursor: pointer;
|
| 406 |
white-space: nowrap;
|
|
@@ -446,8 +448,8 @@ nav.svelte-ti537g.svelte-ti537g {
|
|
| 446 |
content: attr(data-tooltip);
|
| 447 |
position: absolute;
|
| 448 |
bottom: 125%;
|
| 449 |
-
background-color:
|
| 450 |
-
color:
|
| 451 |
padding: 10px;
|
| 452 |
border-radius: 4px;
|
| 453 |
font-size: 12px;
|
|
@@ -569,17 +571,17 @@ span.wrap[tabindex="0"][role="button"][data-editable="false"] {
|
|
| 569 |
background-color: #1C3A3C;
|
| 570 |
}
|
| 571 |
.benchmark-main-subtitle{
|
| 572 |
-
color: var(--color-primary-
|
| 573 |
overflow: hidden;
|
| 574 |
padding-top: 120px;
|
| 575 |
}
|
| 576 |
.benchmark-title{
|
| 577 |
-
color: var(--color-primary-
|
| 578 |
margin-top: 50px;
|
| 579 |
font-size: 20px;
|
| 580 |
}
|
| 581 |
.dark .benchmark-title{
|
| 582 |
-
color: var(--color-primary-
|
| 583 |
}
|
| 584 |
.benchmark-description {
|
| 585 |
margin: 20px 0;
|
|
@@ -692,7 +694,7 @@ span.wrap[tabindex="0"][role="button"][data-editable="false"] {
|
|
| 692 |
|
| 693 |
#feedback-button {
|
| 694 |
display: inline-block;
|
| 695 |
-
background-color:
|
| 696 |
color: white;
|
| 697 |
border: none;
|
| 698 |
border-radius: 4px;
|
|
@@ -704,15 +706,15 @@ span.wrap[tabindex="0"][role="button"][data-editable="false"] {
|
|
| 704 |
}
|
| 705 |
|
| 706 |
#feedback-button:hover {
|
| 707 |
-
background-color:
|
| 708 |
transform: translateY(-2px);
|
| 709 |
box-shadow: 0 6px 12px rgba(0,0,0,0.3);
|
| 710 |
}
|
| 711 |
.dark #main-header h2 {
|
| 712 |
-
color:
|
| 713 |
}
|
| 714 |
#main-header h2 {
|
| 715 |
-
color:
|
| 716 |
}
|
| 717 |
|
| 718 |
/* --- New HTML-Based Tooltip Styles --- */
|
|
@@ -732,8 +734,8 @@ span.wrap[tabindex="0"][role="button"][data-editable="false"] {
|
|
| 732 |
/* Card appearance */
|
| 733 |
position: fixed;
|
| 734 |
z-index: 1000;
|
| 735 |
-
background-color:
|
| 736 |
-
color:
|
| 737 |
border-radius: 12px;
|
| 738 |
padding: 15px;
|
| 739 |
width: max-content;
|
|
|
|
| 30 |
</ul>
|
| 31 |
|
| 32 |
<p>
|
| 33 |
+
This view is designed for quick comparison of general-purpose coding agents. For more details on how we calculate scores and cost, please see the <a href="/about" style="color: #6366F1; text-decoration: underline;">About</a> Page.
|
| 34 |
</p>
|
| 35 |
"""
|
| 36 |
SCATTER_DISCLAIMER = """
|
|
|
|
| 237 |
|
| 238 |
|
| 239 |
css = """
|
| 240 |
+
/* CSS Color Variables using All-Hands-AI color scheme */
|
| 241 |
:root {
|
| 242 |
+
--color-primary-accent: #FFE165; /* Yellow accent from All-Hands-AI */
|
| 243 |
+
--color-primary-link: #6366F1; /* Indigo/purple links */
|
| 244 |
+
--color-neutral-light: #D4CABD; /* Tan placeholder */
|
| 245 |
+
--color-background-light: #F1EAE0; /* Beige/cream background */
|
| 246 |
+
--color-background-dark: #292A36; /* Dark gray */
|
| 247 |
+
--color-text-dark: #292A36; /* Dark gray text */
|
| 248 |
+
--color-text-light: #F1EAE0; /* Light text for dark backgrounds */
|
| 249 |
+
--color-button-hover: #4B5563; /* Darker gray for hover states */
|
| 250 |
}
|
| 251 |
|
| 252 |
/* This makes space for the huggingface header bar which must shown on HF spaces. */
|
|
|
|
| 352 |
overflow: visible !important;
|
| 353 |
}
|
| 354 |
#pareto-disclaimer {
|
| 355 |
+
color: var(--color-primary-accent) !important;
|
| 356 |
}
|
| 357 |
thead.svelte-1e98i6s th {
|
| 358 |
background: white !important;
|
| 359 |
}
|
| 360 |
.dark thead.svelte-1e98i6s th {
|
| 361 |
+
background: var(--color-background-dark) !important;
|
| 362 |
}
|
| 363 |
.cell-wrap.svelte-v1pjjd {
|
| 364 |
font-family: 'Manrope';
|
|
|
|
| 378 |
position: relative !important;
|
| 379 |
}
|
| 380 |
.dark #leaderboard-accordion .label-wrap {
|
| 381 |
+
color: var(--color-primary-accent) !important;
|
| 382 |
}
|
| 383 |
.dark block.svelte-1svsvh2 {
|
| 384 |
+
background: var(--color-background-dark) !important;
|
| 385 |
}
|
| 386 |
.padding.svelte-phx28p {
|
| 387 |
padding: 0 !important;
|
|
|
|
| 393 |
gap: 10px !important;
|
| 394 |
}
|
| 395 |
.dark .primary-link-button {
|
| 396 |
+
color: var(--color-primary-link);
|
| 397 |
}
|
| 398 |
.primary-link-button {
|
| 399 |
background: none;
|
|
|
|
| 402 |
margin: 0;
|
| 403 |
font-family: inherit;
|
| 404 |
font-size: 16px;
|
| 405 |
+
color: var(--color-primary-link);
|
| 406 |
text-decoration: none;
|
| 407 |
cursor: pointer;
|
| 408 |
white-space: nowrap;
|
|
|
|
| 448 |
content: attr(data-tooltip);
|
| 449 |
position: absolute;
|
| 450 |
bottom: 125%;
|
| 451 |
+
background-color: var(--color-background-dark);
|
| 452 |
+
color: var(--color-text-light);
|
| 453 |
padding: 10px;
|
| 454 |
border-radius: 4px;
|
| 455 |
font-size: 12px;
|
|
|
|
| 571 |
background-color: #1C3A3C;
|
| 572 |
}
|
| 573 |
.benchmark-main-subtitle{
|
| 574 |
+
color: var(--color-primary-link);
|
| 575 |
overflow: hidden;
|
| 576 |
padding-top: 120px;
|
| 577 |
}
|
| 578 |
.benchmark-title{
|
| 579 |
+
color: var(--color-primary-link);
|
| 580 |
margin-top: 50px;
|
| 581 |
font-size: 20px;
|
| 582 |
}
|
| 583 |
.dark .benchmark-title{
|
| 584 |
+
color: var(--color-primary-accent);
|
| 585 |
}
|
| 586 |
.benchmark-description {
|
| 587 |
margin: 20px 0;
|
|
|
|
| 694 |
|
| 695 |
#feedback-button {
|
| 696 |
display: inline-block;
|
| 697 |
+
background-color: var(--color-primary-link);
|
| 698 |
color: white;
|
| 699 |
border: none;
|
| 700 |
border-radius: 4px;
|
|
|
|
| 706 |
}
|
| 707 |
|
| 708 |
#feedback-button:hover {
|
| 709 |
+
background-color: var(--color-button-hover);
|
| 710 |
transform: translateY(-2px);
|
| 711 |
box-shadow: 0 6px 12px rgba(0,0,0,0.3);
|
| 712 |
}
|
| 713 |
.dark #main-header h2 {
|
| 714 |
+
color: var(--color-primary-accent);
|
| 715 |
}
|
| 716 |
#main-header h2 {
|
| 717 |
+
color: var(--color-primary-link);
|
| 718 |
}
|
| 719 |
|
| 720 |
/* --- New HTML-Based Tooltip Styles --- */
|
|
|
|
| 734 |
/* Card appearance */
|
| 735 |
position: fixed;
|
| 736 |
z-index: 1000;
|
| 737 |
+
background-color: var(--color-background-dark);
|
| 738 |
+
color: var(--color-text-light);
|
| 739 |
border-radius: 12px;
|
| 740 |
padding: 15px;
|
| 741 |
width: max-content;
|
data/extracted/agenteval.json
ADDED
|
@@ -0,0 +1,74 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"suite_config": {
|
| 3 |
+
"name": "openhands-index",
|
| 4 |
+
"version": "1.0.0-dev1",
|
| 5 |
+
"splits": [
|
| 6 |
+
{
|
| 7 |
+
"name": "swe-bench",
|
| 8 |
+
"tasks": [
|
| 9 |
+
{
|
| 10 |
+
"name": "swe-bench",
|
| 11 |
+
"tags": [
|
| 12 |
+
"swe-bench"
|
| 13 |
+
]
|
| 14 |
+
}
|
| 15 |
+
]
|
| 16 |
+
},
|
| 17 |
+
{
|
| 18 |
+
"name": "multi-swe-bench",
|
| 19 |
+
"tasks": [
|
| 20 |
+
{
|
| 21 |
+
"name": "multi-swe-bench",
|
| 22 |
+
"tags": [
|
| 23 |
+
"multi-swe-bench"
|
| 24 |
+
]
|
| 25 |
+
}
|
| 26 |
+
]
|
| 27 |
+
},
|
| 28 |
+
{
|
| 29 |
+
"name": "swe-bench-multimodal",
|
| 30 |
+
"tasks": [
|
| 31 |
+
{
|
| 32 |
+
"name": "swe-bench-multimodal",
|
| 33 |
+
"tags": [
|
| 34 |
+
"swe-bench-multimodal"
|
| 35 |
+
]
|
| 36 |
+
}
|
| 37 |
+
]
|
| 38 |
+
},
|
| 39 |
+
{
|
| 40 |
+
"name": "swt-bench",
|
| 41 |
+
"tasks": [
|
| 42 |
+
{
|
| 43 |
+
"name": "swt-bench",
|
| 44 |
+
"tags": [
|
| 45 |
+
"swt-bench"
|
| 46 |
+
]
|
| 47 |
+
}
|
| 48 |
+
]
|
| 49 |
+
},
|
| 50 |
+
{
|
| 51 |
+
"name": "commit0",
|
| 52 |
+
"tasks": [
|
| 53 |
+
{
|
| 54 |
+
"name": "commit0",
|
| 55 |
+
"tags": [
|
| 56 |
+
"commit0"
|
| 57 |
+
]
|
| 58 |
+
}
|
| 59 |
+
]
|
| 60 |
+
},
|
| 61 |
+
{
|
| 62 |
+
"name": "gaia",
|
| 63 |
+
"tasks": [
|
| 64 |
+
{
|
| 65 |
+
"name": "gaia",
|
| 66 |
+
"tags": [
|
| 67 |
+
"gaia"
|
| 68 |
+
]
|
| 69 |
+
}
|
| 70 |
+
]
|
| 71 |
+
}
|
| 72 |
+
]
|
| 73 |
+
}
|
| 74 |
+
}
|
data/extracted/commit0.jsonl
ADDED
|
@@ -0,0 +1,5 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{"agent_name": "1.0.2", "llm_base": "claude-3-5-sonnet-20241022", "openness": "closed_api_available", "tool_usage": "standard", "score": 71.2, "metric": "test_pass_rate", "submission_time": "2025-11-24T23:38:02.231539", "tags": ["commit0"], "total_cost": 45.6, "total_runtime": 656.0}
|
| 2 |
+
{"agent_name": "1.0.1", "llm_base": "gpt-4o-2024-11-20", "openness": "closed_api_available", "tool_usage": "standard", "score": 68.9, "metric": "test_pass_rate", "submission_time": "2025-11-24T23:38:02.231556", "tags": ["commit0"], "total_cost": 44.45, "total_runtime": 644.5}
|
| 3 |
+
{"agent_name": "1.0.0", "llm_base": "gpt-4-turbo-2024-04-09", "openness": "closed_api_available", "tool_usage": "standard", "score": 61.5, "metric": "test_pass_rate", "submission_time": "2025-11-24T23:38:02.231563", "tags": ["commit0"], "total_cost": 40.75, "total_runtime": 607.5}
|
| 4 |
+
{"agent_name": "0.9.5", "llm_base": "gpt-4o-mini-2024-07-18", "openness": "closed_api_available", "tool_usage": "standard", "score": 55.3, "metric": "test_pass_rate", "submission_time": "2025-11-24T23:38:02.231569", "tags": ["commit0"], "total_cost": 37.65, "total_runtime": 576.5}
|
| 5 |
+
{"agent_name": "0.9.0", "llm_base": "claude-3-opus-20240229", "openness": "closed_api_available", "tool_usage": "custom_interface", "score": 52.1, "metric": "test_pass_rate", "submission_time": "2025-11-24T23:38:02.231574", "tags": ["commit0"], "total_cost": 36.05, "total_runtime": 560.5}
|
data/extracted/gaia.jsonl
ADDED
|
@@ -0,0 +1,5 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{"agent_name": "1.0.2", "llm_base": "claude-3-5-sonnet-20241022", "openness": "closed_api_available", "tool_usage": "standard", "score": 58.7, "metric": "accuracy", "submission_time": "2025-11-24T23:38:02.231711", "tags": ["gaia"], "total_cost": 39.35, "total_runtime": 593.5}
|
| 2 |
+
{"agent_name": "1.0.1", "llm_base": "gpt-4o-2024-11-20", "openness": "closed_api_available", "tool_usage": "standard", "score": 55.2, "metric": "accuracy", "submission_time": "2025-11-24T23:38:02.231728", "tags": ["gaia"], "total_cost": 37.6, "total_runtime": 576.0}
|
| 3 |
+
{"agent_name": "1.0.0", "llm_base": "gpt-4-turbo-2024-04-09", "openness": "closed_api_available", "tool_usage": "standard", "score": 48.3, "metric": "accuracy", "submission_time": "2025-11-24T23:38:02.231735", "tags": ["gaia"], "total_cost": 34.15, "total_runtime": 541.5}
|
| 4 |
+
{"agent_name": "0.9.5", "llm_base": "gpt-4o-mini-2024-07-18", "openness": "closed_api_available", "tool_usage": "standard", "score": 42.1, "metric": "accuracy", "submission_time": "2025-11-24T23:38:02.231741", "tags": ["gaia"], "total_cost": 31.05, "total_runtime": 510.5}
|
| 5 |
+
{"agent_name": "0.9.0", "llm_base": "claude-3-opus-20240229", "openness": "closed_api_available", "tool_usage": "custom_interface", "score": 39.4, "metric": "accuracy", "submission_time": "2025-11-24T23:38:02.231749", "tags": ["gaia"], "total_cost": 29.7, "total_runtime": 497.0}
|
data/extracted/multi-swe-bench.jsonl
ADDED
|
@@ -0,0 +1,5 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{"agent_name": "1.0.2", "llm_base": "claude-3-5-sonnet-20241022", "openness": "closed_api_available", "tool_usage": "standard", "score": 35.2, "metric": "resolve_rate", "submission_time": "2025-11-24T23:38:02.230869", "tags": ["multi-swe-bench"], "total_cost": 27.6, "total_runtime": 476.0}
|
| 2 |
+
{"agent_name": "1.0.1", "llm_base": "gpt-4o-2024-11-20", "openness": "closed_api_available", "tool_usage": "standard", "score": 32.8, "metric": "resolve_rate", "submission_time": "2025-11-24T23:38:02.230889", "tags": ["multi-swe-bench"], "total_cost": 26.4, "total_runtime": 464.0}
|
| 3 |
+
{"agent_name": "1.0.0", "llm_base": "gpt-4-turbo-2024-04-09", "openness": "closed_api_available", "tool_usage": "standard", "score": 28.4, "metric": "resolve_rate", "submission_time": "2025-11-24T23:38:02.230899", "tags": ["multi-swe-bench"], "total_cost": 24.2, "total_runtime": 442.0}
|
| 4 |
+
{"agent_name": "0.9.5", "llm_base": "gpt-4o-mini-2024-07-18", "openness": "closed_api_available", "tool_usage": "standard", "score": 24.1, "metric": "resolve_rate", "submission_time": "2025-11-24T23:38:02.230908", "tags": ["multi-swe-bench"], "total_cost": 22.05, "total_runtime": 420.5}
|
| 5 |
+
{"agent_name": "0.9.0", "llm_base": "claude-3-opus-20240229", "openness": "closed_api_available", "tool_usage": "custom_interface", "score": 21.5, "metric": "resolve_rate", "submission_time": "2025-11-24T23:38:02.230917", "tags": ["multi-swe-bench"], "total_cost": 20.75, "total_runtime": 407.5}
|
data/extracted/swe-bench-multimodal.jsonl
ADDED
|
@@ -0,0 +1,5 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{"agent_name": "1.0.2", "llm_base": "claude-3-5-sonnet-20241022", "openness": "closed_api_available", "tool_usage": "standard", "score": 42.1, "metric": "resolve_rate", "submission_time": "2025-11-24T23:38:02.231057", "tags": ["swe-bench-multimodal"], "total_cost": 31.05, "total_runtime": 510.5}
|
| 2 |
+
{"agent_name": "1.0.1", "llm_base": "gpt-4o-2024-11-20", "openness": "closed_api_available", "tool_usage": "standard", "score": 39.5, "metric": "resolve_rate", "submission_time": "2025-11-24T23:38:02.231073", "tags": ["swe-bench-multimodal"], "total_cost": 29.75, "total_runtime": 497.5}
|
| 3 |
+
{"agent_name": "1.0.0", "llm_base": "gpt-4-turbo-2024-04-09", "openness": "closed_api_available", "tool_usage": "standard", "score": 34.2, "metric": "resolve_rate", "submission_time": "2025-11-24T23:38:02.231082", "tags": ["swe-bench-multimodal"], "total_cost": 27.1, "total_runtime": 471.0}
|
| 4 |
+
{"agent_name": "0.9.5", "llm_base": "gpt-4o-mini-2024-07-18", "openness": "closed_api_available", "tool_usage": "standard", "score": 28.9, "metric": "resolve_rate", "submission_time": "2025-11-24T23:38:02.231088", "tags": ["swe-bench-multimodal"], "total_cost": 24.45, "total_runtime": 444.5}
|
| 5 |
+
{"agent_name": "0.9.0", "llm_base": "claude-3-opus-20240229", "openness": "closed_api_available", "tool_usage": "custom_interface", "score": 25.7, "metric": "resolve_rate", "submission_time": "2025-11-24T23:38:02.231093", "tags": ["swe-bench-multimodal"], "total_cost": 22.85, "total_runtime": 428.5}
|
data/extracted/swe-bench.jsonl
ADDED
|
@@ -0,0 +1,5 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{"agent_name": "1.0.2", "llm_base": "claude-3-5-sonnet-20241022", "openness": "closed_api_available", "tool_usage": "standard", "score": 48.3, "metric": "resolve_rate", "submission_time": "2025-11-24T23:38:02.230638", "tags": ["swe-bench"], "total_cost": 34.15, "total_runtime": 541.5}
|
| 2 |
+
{"agent_name": "1.0.1", "llm_base": "gpt-4o-2024-11-20", "openness": "closed_api_available", "tool_usage": "standard", "score": 45.1, "metric": "resolve_rate", "submission_time": "2025-11-24T23:38:02.230668", "tags": ["swe-bench"], "total_cost": 32.55, "total_runtime": 525.5}
|
| 3 |
+
{"agent_name": "1.0.0", "llm_base": "gpt-4-turbo-2024-04-09", "openness": "closed_api_available", "tool_usage": "standard", "score": 38.7, "metric": "resolve_rate", "submission_time": "2025-11-24T23:38:02.230681", "tags": ["swe-bench"], "total_cost": 29.35, "total_runtime": 493.5}
|
| 4 |
+
{"agent_name": "0.9.5", "llm_base": "gpt-4o-mini-2024-07-18", "openness": "closed_api_available", "tool_usage": "standard", "score": 32.5, "metric": "resolve_rate", "submission_time": "2025-11-24T23:38:02.230689", "tags": ["swe-bench"], "total_cost": 26.25, "total_runtime": 462.5}
|
| 5 |
+
{"agent_name": "0.9.0", "llm_base": "claude-3-opus-20240229", "openness": "closed_api_available", "tool_usage": "custom_interface", "score": 29.8, "metric": "resolve_rate", "submission_time": "2025-11-24T23:38:02.230696", "tags": ["swe-bench"], "total_cost": 24.9, "total_runtime": 449.0}
|
data/extracted/swt-bench.jsonl
ADDED
|
@@ -0,0 +1,5 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{"agent_name": "1.0.2", "llm_base": "claude-3-5-sonnet-20241022", "openness": "closed_api_available", "tool_usage": "standard", "score": 65.4, "metric": "success_rate", "submission_time": "2025-11-24T23:38:02.231319", "tags": ["swt-bench"], "total_cost": 42.7, "total_runtime": 627.0}
|
| 2 |
+
{"agent_name": "1.0.1", "llm_base": "gpt-4o-2024-11-20", "openness": "closed_api_available", "tool_usage": "standard", "score": 62.3, "metric": "success_rate", "submission_time": "2025-11-24T23:38:02.231344", "tags": ["swt-bench"], "total_cost": 41.15, "total_runtime": 611.5}
|
| 3 |
+
{"agent_name": "1.0.0", "llm_base": "gpt-4-turbo-2024-04-09", "openness": "closed_api_available", "tool_usage": "standard", "score": 54.1, "metric": "success_rate", "submission_time": "2025-11-24T23:38:02.231356", "tags": ["swt-bench"], "total_cost": 37.05, "total_runtime": 570.5}
|
| 4 |
+
{"agent_name": "0.9.5", "llm_base": "gpt-4o-mini-2024-07-18", "openness": "closed_api_available", "tool_usage": "standard", "score": 47.8, "metric": "success_rate", "submission_time": "2025-11-24T23:38:02.231365", "tags": ["swt-bench"], "total_cost": 33.9, "total_runtime": 539.0}
|
| 5 |
+
{"agent_name": "0.9.0", "llm_base": "claude-3-opus-20240229", "openness": "closed_api_available", "tool_usage": "custom_interface", "score": 44.2, "metric": "success_rate", "submission_time": "2025-11-24T23:38:02.231373", "tags": ["swt-bench"], "total_cost": 32.1, "total_runtime": 521.0}
|
data/extracted/test.jsonl
ADDED
|
@@ -0,0 +1,30 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{"agent_name": "OpenHands CodeAct v2.1", "llm_base": "claude-3-5-sonnet-20241022", "openness": "closed_api_available", "tool_usage": "standard", "score": 48.3, "metric": "resolve_rate", "submission_time": "2025-11-24T19:56:00.092865", "tags": ["swe-bench"], "total_cost": 34.15, "total_runtime": 541.5}
|
| 2 |
+
{"agent_name": "OpenHands CodeAct v2.0", "llm_base": "gpt-4o-2024-11-20", "openness": "closed_api_available", "tool_usage": "standard", "score": 45.1, "metric": "resolve_rate", "submission_time": "2025-11-24T19:56:00.092895", "tags": ["swe-bench"], "total_cost": 32.55, "total_runtime": 525.5}
|
| 3 |
+
{"agent_name": "AutoCodeRover", "llm_base": "gpt-4-turbo-2024-04-09", "openness": "closed_api_available", "tool_usage": "standard", "score": 38.7, "metric": "resolve_rate", "submission_time": "2025-11-24T19:56:00.092908", "tags": ["swe-bench"], "total_cost": 29.35, "total_runtime": 493.5}
|
| 4 |
+
{"agent_name": "Agentless", "llm_base": "gpt-4o-mini-2024-07-18", "openness": "closed_api_available", "tool_usage": "standard", "score": 32.5, "metric": "resolve_rate", "submission_time": "2025-11-24T19:56:00.092916", "tags": ["swe-bench"], "total_cost": 26.25, "total_runtime": 462.5}
|
| 5 |
+
{"agent_name": "SWE-Agent", "llm_base": "claude-3-opus-20240229", "openness": "closed_api_available", "tool_usage": "custom_interface", "score": 29.8, "metric": "resolve_rate", "submission_time": "2025-11-24T19:56:00.092922", "tags": ["swe-bench"], "total_cost": 24.9, "total_runtime": 449.0}
|
| 6 |
+
{"agent_name": "OpenHands CodeAct v2.1", "llm_base": "claude-3-5-sonnet-20241022", "openness": "closed_api_available", "tool_usage": "standard", "score": 35.2, "metric": "resolve_rate", "submission_time": "2025-11-24T19:56:00.093026", "tags": ["multi-swe-bench"], "total_cost": 27.6, "total_runtime": 476.0}
|
| 7 |
+
{"agent_name": "OpenHands CodeAct v2.0", "llm_base": "gpt-4o-2024-11-20", "openness": "closed_api_available", "tool_usage": "standard", "score": 32.8, "metric": "resolve_rate", "submission_time": "2025-11-24T19:56:00.093040", "tags": ["multi-swe-bench"], "total_cost": 26.4, "total_runtime": 464.0}
|
| 8 |
+
{"agent_name": "AutoCodeRover", "llm_base": "gpt-4-turbo-2024-04-09", "openness": "closed_api_available", "tool_usage": "standard", "score": 28.4, "metric": "resolve_rate", "submission_time": "2025-11-24T19:56:00.093048", "tags": ["multi-swe-bench"], "total_cost": 24.2, "total_runtime": 442.0}
|
| 9 |
+
{"agent_name": "Agentless", "llm_base": "gpt-4o-mini-2024-07-18", "openness": "closed_api_available", "tool_usage": "standard", "score": 24.1, "metric": "resolve_rate", "submission_time": "2025-11-24T19:56:00.093058", "tags": ["multi-swe-bench"], "total_cost": 22.05, "total_runtime": 420.5}
|
| 10 |
+
{"agent_name": "SWE-Agent", "llm_base": "claude-3-opus-20240229", "openness": "closed_api_available", "tool_usage": "custom_interface", "score": 21.5, "metric": "resolve_rate", "submission_time": "2025-11-24T19:56:00.093067", "tags": ["multi-swe-bench"], "total_cost": 20.75, "total_runtime": 407.5}
|
| 11 |
+
{"agent_name": "OpenHands CodeAct v2.1", "llm_base": "claude-3-5-sonnet-20241022", "openness": "closed_api_available", "tool_usage": "standard", "score": 42.1, "metric": "resolve_rate", "submission_time": "2025-11-24T19:56:00.093172", "tags": ["swe-bench-multimodal"], "total_cost": 31.05, "total_runtime": 510.5}
|
| 12 |
+
{"agent_name": "OpenHands CodeAct v2.0", "llm_base": "gpt-4o-2024-11-20", "openness": "closed_api_available", "tool_usage": "standard", "score": 39.5, "metric": "resolve_rate", "submission_time": "2025-11-24T19:56:00.093187", "tags": ["swe-bench-multimodal"], "total_cost": 29.75, "total_runtime": 497.5}
|
| 13 |
+
{"agent_name": "AutoCodeRover", "llm_base": "gpt-4-turbo-2024-04-09", "openness": "closed_api_available", "tool_usage": "standard", "score": 34.2, "metric": "resolve_rate", "submission_time": "2025-11-24T19:56:00.093200", "tags": ["swe-bench-multimodal"], "total_cost": 27.1, "total_runtime": 471.0}
|
| 14 |
+
{"agent_name": "Agentless", "llm_base": "gpt-4o-mini-2024-07-18", "openness": "closed_api_available", "tool_usage": "standard", "score": 28.9, "metric": "resolve_rate", "submission_time": "2025-11-24T19:56:00.093208", "tags": ["swe-bench-multimodal"], "total_cost": 24.45, "total_runtime": 444.5}
|
| 15 |
+
{"agent_name": "SWE-Agent", "llm_base": "claude-3-opus-20240229", "openness": "closed_api_available", "tool_usage": "custom_interface", "score": 25.7, "metric": "resolve_rate", "submission_time": "2025-11-24T19:56:00.093218", "tags": ["swe-bench-multimodal"], "total_cost": 22.85, "total_runtime": 428.5}
|
| 16 |
+
{"agent_name": "OpenHands CodeAct v2.1", "llm_base": "claude-3-5-sonnet-20241022", "openness": "closed_api_available", "tool_usage": "standard", "score": 65.4, "metric": "success_rate", "submission_time": "2025-11-24T19:56:00.093304", "tags": ["swt-bench"], "total_cost": 42.7, "total_runtime": 627.0}
|
| 17 |
+
{"agent_name": "OpenHands CodeAct v2.0", "llm_base": "gpt-4o-2024-11-20", "openness": "closed_api_available", "tool_usage": "standard", "score": 62.3, "metric": "success_rate", "submission_time": "2025-11-24T19:56:00.093317", "tags": ["swt-bench"], "total_cost": 41.15, "total_runtime": 611.5}
|
| 18 |
+
{"agent_name": "AutoCodeRover", "llm_base": "gpt-4-turbo-2024-04-09", "openness": "closed_api_available", "tool_usage": "standard", "score": 54.1, "metric": "success_rate", "submission_time": "2025-11-24T19:56:00.093326", "tags": ["swt-bench"], "total_cost": 37.05, "total_runtime": 570.5}
|
| 19 |
+
{"agent_name": "Agentless", "llm_base": "gpt-4o-mini-2024-07-18", "openness": "closed_api_available", "tool_usage": "standard", "score": 47.8, "metric": "success_rate", "submission_time": "2025-11-24T19:56:00.093333", "tags": ["swt-bench"], "total_cost": 33.9, "total_runtime": 539.0}
|
| 20 |
+
{"agent_name": "SWE-Agent", "llm_base": "claude-3-opus-20240229", "openness": "closed_api_available", "tool_usage": "custom_interface", "score": 44.2, "metric": "success_rate", "submission_time": "2025-11-24T19:56:00.093338", "tags": ["swt-bench"], "total_cost": 32.1, "total_runtime": 521.0}
|
| 21 |
+
{"agent_name": "OpenHands CodeAct v2.1", "llm_base": "claude-3-5-sonnet-20241022", "openness": "closed_api_available", "tool_usage": "standard", "score": 71.2, "metric": "test_pass_rate", "submission_time": "2025-11-24T19:56:00.093416", "tags": ["commit0"], "total_cost": 45.6, "total_runtime": 656.0}
|
| 22 |
+
{"agent_name": "OpenHands CodeAct v2.0", "llm_base": "gpt-4o-2024-11-20", "openness": "closed_api_available", "tool_usage": "standard", "score": 68.9, "metric": "test_pass_rate", "submission_time": "2025-11-24T19:56:00.093428", "tags": ["commit0"], "total_cost": 44.45, "total_runtime": 644.5}
|
| 23 |
+
{"agent_name": "AutoCodeRover", "llm_base": "gpt-4-turbo-2024-04-09", "openness": "closed_api_available", "tool_usage": "standard", "score": 61.5, "metric": "test_pass_rate", "submission_time": "2025-11-24T19:56:00.093436", "tags": ["commit0"], "total_cost": 40.75, "total_runtime": 607.5}
|
| 24 |
+
{"agent_name": "Agentless", "llm_base": "gpt-4o-mini-2024-07-18", "openness": "closed_api_available", "tool_usage": "standard", "score": 55.3, "metric": "test_pass_rate", "submission_time": "2025-11-24T19:56:00.093445", "tags": ["commit0"], "total_cost": 37.65, "total_runtime": 576.5}
|
| 25 |
+
{"agent_name": "SWE-Agent", "llm_base": "claude-3-opus-20240229", "openness": "closed_api_available", "tool_usage": "custom_interface", "score": 52.1, "metric": "test_pass_rate", "submission_time": "2025-11-24T19:56:00.093450", "tags": ["commit0"], "total_cost": 36.05, "total_runtime": 560.5}
|
| 26 |
+
{"agent_name": "OpenHands CodeAct v2.1", "llm_base": "claude-3-5-sonnet-20241022", "openness": "closed_api_available", "tool_usage": "standard", "score": 58.7, "metric": "accuracy", "submission_time": "2025-11-24T19:56:00.093541", "tags": ["gaia"], "total_cost": 39.35, "total_runtime": 593.5}
|
| 27 |
+
{"agent_name": "OpenHands CodeAct v2.0", "llm_base": "gpt-4o-2024-11-20", "openness": "closed_api_available", "tool_usage": "standard", "score": 55.2, "metric": "accuracy", "submission_time": "2025-11-24T19:56:00.093556", "tags": ["gaia"], "total_cost": 37.6, "total_runtime": 576.0}
|
| 28 |
+
{"agent_name": "AutoCodeRover", "llm_base": "gpt-4-turbo-2024-04-09", "openness": "closed_api_available", "tool_usage": "standard", "score": 48.3, "metric": "accuracy", "submission_time": "2025-11-24T19:56:00.093563", "tags": ["gaia"], "total_cost": 34.15, "total_runtime": 541.5}
|
| 29 |
+
{"agent_name": "Agentless", "llm_base": "gpt-4o-mini-2024-07-18", "openness": "closed_api_available", "tool_usage": "standard", "score": 42.1, "metric": "accuracy", "submission_time": "2025-11-24T19:56:00.093568", "tags": ["gaia"], "total_cost": 31.05, "total_runtime": 510.5}
|
| 30 |
+
{"agent_name": "SWE-Agent", "llm_base": "claude-3-opus-20240229", "openness": "closed_api_available", "tool_usage": "custom_interface", "score": 39.4, "metric": "accuracy", "submission_time": "2025-11-24T19:56:00.093574", "tags": ["gaia"], "total_cost": 29.7, "total_runtime": 497.0}
|
data/extracted/test.parquet
ADDED
|
Binary file (9.26 kB). View file
|
|
|
data/extracted/validation.jsonl
ADDED
|
@@ -0,0 +1,30 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{"agent_name": "OpenHands CodeAct v2.1", "llm_base": "claude-3-5-sonnet-20241022", "openness": "closed_api_available", "tool_usage": "standard", "score": 48.3, "metric": "resolve_rate", "submission_time": "2025-11-24T19:56:00.092865", "tags": ["swe-bench"], "total_cost": 34.15, "total_runtime": 541.5}
|
| 2 |
+
{"agent_name": "OpenHands CodeAct v2.0", "llm_base": "gpt-4o-2024-11-20", "openness": "closed_api_available", "tool_usage": "standard", "score": 45.1, "metric": "resolve_rate", "submission_time": "2025-11-24T19:56:00.092895", "tags": ["swe-bench"], "total_cost": 32.55, "total_runtime": 525.5}
|
| 3 |
+
{"agent_name": "AutoCodeRover", "llm_base": "gpt-4-turbo-2024-04-09", "openness": "closed_api_available", "tool_usage": "standard", "score": 38.7, "metric": "resolve_rate", "submission_time": "2025-11-24T19:56:00.092908", "tags": ["swe-bench"], "total_cost": 29.35, "total_runtime": 493.5}
|
| 4 |
+
{"agent_name": "Agentless", "llm_base": "gpt-4o-mini-2024-07-18", "openness": "closed_api_available", "tool_usage": "standard", "score": 32.5, "metric": "resolve_rate", "submission_time": "2025-11-24T19:56:00.092916", "tags": ["swe-bench"], "total_cost": 26.25, "total_runtime": 462.5}
|
| 5 |
+
{"agent_name": "SWE-Agent", "llm_base": "claude-3-opus-20240229", "openness": "closed_api_available", "tool_usage": "custom_interface", "score": 29.8, "metric": "resolve_rate", "submission_time": "2025-11-24T19:56:00.092922", "tags": ["swe-bench"], "total_cost": 24.9, "total_runtime": 449.0}
|
| 6 |
+
{"agent_name": "OpenHands CodeAct v2.1", "llm_base": "claude-3-5-sonnet-20241022", "openness": "closed_api_available", "tool_usage": "standard", "score": 35.2, "metric": "resolve_rate", "submission_time": "2025-11-24T19:56:00.093026", "tags": ["multi-swe-bench"], "total_cost": 27.6, "total_runtime": 476.0}
|
| 7 |
+
{"agent_name": "OpenHands CodeAct v2.0", "llm_base": "gpt-4o-2024-11-20", "openness": "closed_api_available", "tool_usage": "standard", "score": 32.8, "metric": "resolve_rate", "submission_time": "2025-11-24T19:56:00.093040", "tags": ["multi-swe-bench"], "total_cost": 26.4, "total_runtime": 464.0}
|
| 8 |
+
{"agent_name": "AutoCodeRover", "llm_base": "gpt-4-turbo-2024-04-09", "openness": "closed_api_available", "tool_usage": "standard", "score": 28.4, "metric": "resolve_rate", "submission_time": "2025-11-24T19:56:00.093048", "tags": ["multi-swe-bench"], "total_cost": 24.2, "total_runtime": 442.0}
|
| 9 |
+
{"agent_name": "Agentless", "llm_base": "gpt-4o-mini-2024-07-18", "openness": "closed_api_available", "tool_usage": "standard", "score": 24.1, "metric": "resolve_rate", "submission_time": "2025-11-24T19:56:00.093058", "tags": ["multi-swe-bench"], "total_cost": 22.05, "total_runtime": 420.5}
|
| 10 |
+
{"agent_name": "SWE-Agent", "llm_base": "claude-3-opus-20240229", "openness": "closed_api_available", "tool_usage": "custom_interface", "score": 21.5, "metric": "resolve_rate", "submission_time": "2025-11-24T19:56:00.093067", "tags": ["multi-swe-bench"], "total_cost": 20.75, "total_runtime": 407.5}
|
| 11 |
+
{"agent_name": "OpenHands CodeAct v2.1", "llm_base": "claude-3-5-sonnet-20241022", "openness": "closed_api_available", "tool_usage": "standard", "score": 42.1, "metric": "resolve_rate", "submission_time": "2025-11-24T19:56:00.093172", "tags": ["swe-bench-multimodal"], "total_cost": 31.05, "total_runtime": 510.5}
|
| 12 |
+
{"agent_name": "OpenHands CodeAct v2.0", "llm_base": "gpt-4o-2024-11-20", "openness": "closed_api_available", "tool_usage": "standard", "score": 39.5, "metric": "resolve_rate", "submission_time": "2025-11-24T19:56:00.093187", "tags": ["swe-bench-multimodal"], "total_cost": 29.75, "total_runtime": 497.5}
|
| 13 |
+
{"agent_name": "AutoCodeRover", "llm_base": "gpt-4-turbo-2024-04-09", "openness": "closed_api_available", "tool_usage": "standard", "score": 34.2, "metric": "resolve_rate", "submission_time": "2025-11-24T19:56:00.093200", "tags": ["swe-bench-multimodal"], "total_cost": 27.1, "total_runtime": 471.0}
|
| 14 |
+
{"agent_name": "Agentless", "llm_base": "gpt-4o-mini-2024-07-18", "openness": "closed_api_available", "tool_usage": "standard", "score": 28.9, "metric": "resolve_rate", "submission_time": "2025-11-24T19:56:00.093208", "tags": ["swe-bench-multimodal"], "total_cost": 24.45, "total_runtime": 444.5}
|
| 15 |
+
{"agent_name": "SWE-Agent", "llm_base": "claude-3-opus-20240229", "openness": "closed_api_available", "tool_usage": "custom_interface", "score": 25.7, "metric": "resolve_rate", "submission_time": "2025-11-24T19:56:00.093218", "tags": ["swe-bench-multimodal"], "total_cost": 22.85, "total_runtime": 428.5}
|
| 16 |
+
{"agent_name": "OpenHands CodeAct v2.1", "llm_base": "claude-3-5-sonnet-20241022", "openness": "closed_api_available", "tool_usage": "standard", "score": 65.4, "metric": "success_rate", "submission_time": "2025-11-24T19:56:00.093304", "tags": ["swt-bench"], "total_cost": 42.7, "total_runtime": 627.0}
|
| 17 |
+
{"agent_name": "OpenHands CodeAct v2.0", "llm_base": "gpt-4o-2024-11-20", "openness": "closed_api_available", "tool_usage": "standard", "score": 62.3, "metric": "success_rate", "submission_time": "2025-11-24T19:56:00.093317", "tags": ["swt-bench"], "total_cost": 41.15, "total_runtime": 611.5}
|
| 18 |
+
{"agent_name": "AutoCodeRover", "llm_base": "gpt-4-turbo-2024-04-09", "openness": "closed_api_available", "tool_usage": "standard", "score": 54.1, "metric": "success_rate", "submission_time": "2025-11-24T19:56:00.093326", "tags": ["swt-bench"], "total_cost": 37.05, "total_runtime": 570.5}
|
| 19 |
+
{"agent_name": "Agentless", "llm_base": "gpt-4o-mini-2024-07-18", "openness": "closed_api_available", "tool_usage": "standard", "score": 47.8, "metric": "success_rate", "submission_time": "2025-11-24T19:56:00.093333", "tags": ["swt-bench"], "total_cost": 33.9, "total_runtime": 539.0}
|
| 20 |
+
{"agent_name": "SWE-Agent", "llm_base": "claude-3-opus-20240229", "openness": "closed_api_available", "tool_usage": "custom_interface", "score": 44.2, "metric": "success_rate", "submission_time": "2025-11-24T19:56:00.093338", "tags": ["swt-bench"], "total_cost": 32.1, "total_runtime": 521.0}
|
| 21 |
+
{"agent_name": "OpenHands CodeAct v2.1", "llm_base": "claude-3-5-sonnet-20241022", "openness": "closed_api_available", "tool_usage": "standard", "score": 71.2, "metric": "test_pass_rate", "submission_time": "2025-11-24T19:56:00.093416", "tags": ["commit0"], "total_cost": 45.6, "total_runtime": 656.0}
|
| 22 |
+
{"agent_name": "OpenHands CodeAct v2.0", "llm_base": "gpt-4o-2024-11-20", "openness": "closed_api_available", "tool_usage": "standard", "score": 68.9, "metric": "test_pass_rate", "submission_time": "2025-11-24T19:56:00.093428", "tags": ["commit0"], "total_cost": 44.45, "total_runtime": 644.5}
|
| 23 |
+
{"agent_name": "AutoCodeRover", "llm_base": "gpt-4-turbo-2024-04-09", "openness": "closed_api_available", "tool_usage": "standard", "score": 61.5, "metric": "test_pass_rate", "submission_time": "2025-11-24T19:56:00.093436", "tags": ["commit0"], "total_cost": 40.75, "total_runtime": 607.5}
|
| 24 |
+
{"agent_name": "Agentless", "llm_base": "gpt-4o-mini-2024-07-18", "openness": "closed_api_available", "tool_usage": "standard", "score": 55.3, "metric": "test_pass_rate", "submission_time": "2025-11-24T19:56:00.093445", "tags": ["commit0"], "total_cost": 37.65, "total_runtime": 576.5}
|
| 25 |
+
{"agent_name": "SWE-Agent", "llm_base": "claude-3-opus-20240229", "openness": "closed_api_available", "tool_usage": "custom_interface", "score": 52.1, "metric": "test_pass_rate", "submission_time": "2025-11-24T19:56:00.093450", "tags": ["commit0"], "total_cost": 36.05, "total_runtime": 560.5}
|
| 26 |
+
{"agent_name": "OpenHands CodeAct v2.1", "llm_base": "claude-3-5-sonnet-20241022", "openness": "closed_api_available", "tool_usage": "standard", "score": 58.7, "metric": "accuracy", "submission_time": "2025-11-24T19:56:00.093541", "tags": ["gaia"], "total_cost": 39.35, "total_runtime": 593.5}
|
| 27 |
+
{"agent_name": "OpenHands CodeAct v2.0", "llm_base": "gpt-4o-2024-11-20", "openness": "closed_api_available", "tool_usage": "standard", "score": 55.2, "metric": "accuracy", "submission_time": "2025-11-24T19:56:00.093556", "tags": ["gaia"], "total_cost": 37.6, "total_runtime": 576.0}
|
| 28 |
+
{"agent_name": "AutoCodeRover", "llm_base": "gpt-4-turbo-2024-04-09", "openness": "closed_api_available", "tool_usage": "standard", "score": 48.3, "metric": "accuracy", "submission_time": "2025-11-24T19:56:00.093563", "tags": ["gaia"], "total_cost": 34.15, "total_runtime": 541.5}
|
| 29 |
+
{"agent_name": "Agentless", "llm_base": "gpt-4o-mini-2024-07-18", "openness": "closed_api_available", "tool_usage": "standard", "score": 42.1, "metric": "accuracy", "submission_time": "2025-11-24T19:56:00.093568", "tags": ["gaia"], "total_cost": 31.05, "total_runtime": 510.5}
|
| 30 |
+
{"agent_name": "SWE-Agent", "llm_base": "claude-3-opus-20240229", "openness": "closed_api_available", "tool_usage": "custom_interface", "score": 39.4, "metric": "accuracy", "submission_time": "2025-11-24T19:56:00.093574", "tags": ["gaia"], "total_cost": 29.7, "total_runtime": 497.0}
|
data/extracted/validation.parquet
ADDED
|
Binary file (9.29 kB). View file
|
|
|
generate_mock_jsonl.py
CHANGED
|
@@ -41,7 +41,7 @@ BENCHMARKS = {
|
|
| 41 |
# Mock agents with realistic scores
|
| 42 |
MOCK_AGENTS = [
|
| 43 |
{
|
| 44 |
-
"agent_name": "
|
| 45 |
"llm_base": "claude-3-5-sonnet-20241022",
|
| 46 |
"openness": "closed_api_available",
|
| 47 |
"tool_usage": "standard",
|
|
@@ -55,7 +55,7 @@ MOCK_AGENTS = [
|
|
| 55 |
}
|
| 56 |
},
|
| 57 |
{
|
| 58 |
-
"agent_name": "
|
| 59 |
"llm_base": "gpt-4o-2024-11-20",
|
| 60 |
"openness": "closed_api_available",
|
| 61 |
"tool_usage": "standard",
|
|
@@ -69,7 +69,7 @@ MOCK_AGENTS = [
|
|
| 69 |
}
|
| 70 |
},
|
| 71 |
{
|
| 72 |
-
"agent_name": "
|
| 73 |
"llm_base": "gpt-4-turbo-2024-04-09",
|
| 74 |
"openness": "closed_api_available",
|
| 75 |
"tool_usage": "standard",
|
|
@@ -83,7 +83,7 @@ MOCK_AGENTS = [
|
|
| 83 |
}
|
| 84 |
},
|
| 85 |
{
|
| 86 |
-
"agent_name": "
|
| 87 |
"llm_base": "gpt-4o-mini-2024-07-18",
|
| 88 |
"openness": "closed_api_available",
|
| 89 |
"tool_usage": "standard",
|
|
@@ -97,7 +97,7 @@ MOCK_AGENTS = [
|
|
| 97 |
}
|
| 98 |
},
|
| 99 |
{
|
| 100 |
-
"agent_name": "
|
| 101 |
"llm_base": "claude-3-opus-20240229",
|
| 102 |
"openness": "closed_api_available",
|
| 103 |
"tool_usage": "custom_interface",
|
|
|
|
| 41 |
# Mock agents with realistic scores
|
| 42 |
MOCK_AGENTS = [
|
| 43 |
{
|
| 44 |
+
"agent_name": "1.0.2",
|
| 45 |
"llm_base": "claude-3-5-sonnet-20241022",
|
| 46 |
"openness": "closed_api_available",
|
| 47 |
"tool_usage": "standard",
|
|
|
|
| 55 |
}
|
| 56 |
},
|
| 57 |
{
|
| 58 |
+
"agent_name": "1.0.1",
|
| 59 |
"llm_base": "gpt-4o-2024-11-20",
|
| 60 |
"openness": "closed_api_available",
|
| 61 |
"tool_usage": "standard",
|
|
|
|
| 69 |
}
|
| 70 |
},
|
| 71 |
{
|
| 72 |
+
"agent_name": "1.0.0",
|
| 73 |
"llm_base": "gpt-4-turbo-2024-04-09",
|
| 74 |
"openness": "closed_api_available",
|
| 75 |
"tool_usage": "standard",
|
|
|
|
| 83 |
}
|
| 84 |
},
|
| 85 |
{
|
| 86 |
+
"agent_name": "0.9.5",
|
| 87 |
"llm_base": "gpt-4o-mini-2024-07-18",
|
| 88 |
"openness": "closed_api_available",
|
| 89 |
"tool_usage": "standard",
|
|
|
|
| 97 |
}
|
| 98 |
},
|
| 99 |
{
|
| 100 |
+
"agent_name": "0.9.0",
|
| 101 |
"llm_base": "claude-3-opus-20240229",
|
| 102 |
"openness": "closed_api_available",
|
| 103 |
"tool_usage": "custom_interface",
|
leaderboard_transformer.py
CHANGED
|
@@ -103,16 +103,15 @@ def _pretty_column_name(raw_col: str) -> str:
|
|
| 103 |
# Case 1: Handle fixed, special-case mappings first.
|
| 104 |
fixed_mappings = {
|
| 105 |
'id': 'id',
|
| 106 |
-
'Agent': 'Agent',
|
| 107 |
'Agent description': 'Agent Description',
|
| 108 |
-
'User/organization': 'Submitter',
|
| 109 |
'Submission date': 'Date',
|
| 110 |
'Overall': 'Overall Score',
|
| 111 |
'Overall cost': 'Overall Cost',
|
| 112 |
'Logs': 'Logs',
|
| 113 |
'Openness': 'Openness',
|
| 114 |
'Agent tooling': 'Agent Tooling',
|
| 115 |
-
'LLM base': '
|
| 116 |
'Source': 'Source',
|
| 117 |
}
|
| 118 |
|
|
|
|
| 103 |
# Case 1: Handle fixed, special-case mappings first.
|
| 104 |
fixed_mappings = {
|
| 105 |
'id': 'id',
|
| 106 |
+
'Agent': 'Agent Version',
|
| 107 |
'Agent description': 'Agent Description',
|
|
|
|
| 108 |
'Submission date': 'Date',
|
| 109 |
'Overall': 'Overall Score',
|
| 110 |
'Overall cost': 'Overall Cost',
|
| 111 |
'Logs': 'Logs',
|
| 112 |
'Openness': 'Openness',
|
| 113 |
'Agent tooling': 'Agent Tooling',
|
| 114 |
+
'LLM base': 'Model',
|
| 115 |
'Source': 'Source',
|
| 116 |
}
|
| 117 |
|
mock_results/1.0.0-dev1/agenteval.json
CHANGED
|
@@ -4,106 +4,66 @@
|
|
| 4 |
"version": "1.0.0-dev1",
|
| 5 |
"splits": [
|
| 6 |
{
|
| 7 |
-
"name": "
|
| 8 |
"tasks": [
|
| 9 |
{
|
| 10 |
"name": "swe-bench",
|
| 11 |
"tags": [
|
| 12 |
-
"Overall",
|
| 13 |
-
"Bug Fixing",
|
| 14 |
"swe-bench"
|
| 15 |
]
|
| 16 |
-
}
|
| 17 |
-
|
| 18 |
-
|
| 19 |
-
|
| 20 |
-
|
| 21 |
-
|
| 22 |
-
"swe-bench-multimodal"
|
| 23 |
-
]
|
| 24 |
-
},
|
| 25 |
-
{
|
| 26 |
-
"name": "commit0",
|
| 27 |
-
"tags": [
|
| 28 |
-
"Overall",
|
| 29 |
-
"App Creation",
|
| 30 |
-
"commit0"
|
| 31 |
-
]
|
| 32 |
-
},
|
| 33 |
{
|
| 34 |
"name": "multi-swe-bench",
|
| 35 |
"tags": [
|
| 36 |
-
"Overall",
|
| 37 |
-
"Frontend Development",
|
| 38 |
"multi-swe-bench"
|
| 39 |
]
|
| 40 |
-
},
|
| 41 |
-
{
|
| 42 |
-
"name": "swt-bench",
|
| 43 |
-
"tags": [
|
| 44 |
-
"Overall",
|
| 45 |
-
"Test Generation",
|
| 46 |
-
"swt-bench"
|
| 47 |
-
]
|
| 48 |
-
},
|
| 49 |
-
{
|
| 50 |
-
"name": "gaia",
|
| 51 |
-
"tags": [
|
| 52 |
-
"Overall",
|
| 53 |
-
"Information Gathering",
|
| 54 |
-
"gaia"
|
| 55 |
-
]
|
| 56 |
}
|
| 57 |
]
|
| 58 |
},
|
| 59 |
{
|
| 60 |
-
"name": "
|
| 61 |
"tasks": [
|
| 62 |
-
{
|
| 63 |
-
"name": "swe-bench",
|
| 64 |
-
"tags": [
|
| 65 |
-
"Overall",
|
| 66 |
-
"Bug Fixing",
|
| 67 |
-
"swe-bench"
|
| 68 |
-
]
|
| 69 |
-
},
|
| 70 |
{
|
| 71 |
"name": "swe-bench-multimodal",
|
| 72 |
"tags": [
|
| 73 |
-
"Overall",
|
| 74 |
-
"Bug Fixing",
|
| 75 |
"swe-bench-multimodal"
|
| 76 |
]
|
| 77 |
-
}
|
| 78 |
-
|
| 79 |
-
|
| 80 |
-
|
| 81 |
-
|
| 82 |
-
|
| 83 |
-
"commit0"
|
| 84 |
-
]
|
| 85 |
-
},
|
| 86 |
{
|
| 87 |
-
"name": "
|
| 88 |
"tags": [
|
| 89 |
-
"
|
| 90 |
-
"Frontend Development",
|
| 91 |
-
"multi-swe-bench"
|
| 92 |
]
|
| 93 |
-
}
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 94 |
{
|
| 95 |
-
"name": "
|
| 96 |
"tags": [
|
| 97 |
-
"
|
| 98 |
-
"Test Generation",
|
| 99 |
-
"swt-bench"
|
| 100 |
]
|
| 101 |
-
}
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 102 |
{
|
| 103 |
"name": "gaia",
|
| 104 |
"tags": [
|
| 105 |
-
"Overall",
|
| 106 |
-
"Information Gathering",
|
| 107 |
"gaia"
|
| 108 |
]
|
| 109 |
}
|
|
|
|
| 4 |
"version": "1.0.0-dev1",
|
| 5 |
"splits": [
|
| 6 |
{
|
| 7 |
+
"name": "swe-bench",
|
| 8 |
"tasks": [
|
| 9 |
{
|
| 10 |
"name": "swe-bench",
|
| 11 |
"tags": [
|
|
|
|
|
|
|
| 12 |
"swe-bench"
|
| 13 |
]
|
| 14 |
+
}
|
| 15 |
+
]
|
| 16 |
+
},
|
| 17 |
+
{
|
| 18 |
+
"name": "multi-swe-bench",
|
| 19 |
+
"tasks": [
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 20 |
{
|
| 21 |
"name": "multi-swe-bench",
|
| 22 |
"tags": [
|
|
|
|
|
|
|
| 23 |
"multi-swe-bench"
|
| 24 |
]
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 25 |
}
|
| 26 |
]
|
| 27 |
},
|
| 28 |
{
|
| 29 |
+
"name": "swe-bench-multimodal",
|
| 30 |
"tasks": [
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 31 |
{
|
| 32 |
"name": "swe-bench-multimodal",
|
| 33 |
"tags": [
|
|
|
|
|
|
|
| 34 |
"swe-bench-multimodal"
|
| 35 |
]
|
| 36 |
+
}
|
| 37 |
+
]
|
| 38 |
+
},
|
| 39 |
+
{
|
| 40 |
+
"name": "swt-bench",
|
| 41 |
+
"tasks": [
|
|
|
|
|
|
|
|
|
|
| 42 |
{
|
| 43 |
+
"name": "swt-bench",
|
| 44 |
"tags": [
|
| 45 |
+
"swt-bench"
|
|
|
|
|
|
|
| 46 |
]
|
| 47 |
+
}
|
| 48 |
+
]
|
| 49 |
+
},
|
| 50 |
+
{
|
| 51 |
+
"name": "commit0",
|
| 52 |
+
"tasks": [
|
| 53 |
{
|
| 54 |
+
"name": "commit0",
|
| 55 |
"tags": [
|
| 56 |
+
"commit0"
|
|
|
|
|
|
|
| 57 |
]
|
| 58 |
+
}
|
| 59 |
+
]
|
| 60 |
+
},
|
| 61 |
+
{
|
| 62 |
+
"name": "gaia",
|
| 63 |
+
"tasks": [
|
| 64 |
{
|
| 65 |
"name": "gaia",
|
| 66 |
"tags": [
|
|
|
|
|
|
|
| 67 |
"gaia"
|
| 68 |
]
|
| 69 |
}
|
mock_results/1.0.0-dev1/commit0.jsonl
CHANGED
|
@@ -1,5 +1,5 @@
|
|
| 1 |
-
{"agent_name": "
|
| 2 |
-
{"agent_name": "
|
| 3 |
-
{"agent_name": "
|
| 4 |
-
{"agent_name": "
|
| 5 |
-
{"agent_name": "
|
|
|
|
| 1 |
+
{"agent_name": "1.0.2", "llm_base": "claude-3-5-sonnet-20241022", "openness": "closed_api_available", "tool_usage": "standard", "score": 71.2, "metric": "test_pass_rate", "submission_time": "2025-11-24T23:38:02.231539", "tags": ["commit0"], "total_cost": 45.6, "total_runtime": 656.0}
|
| 2 |
+
{"agent_name": "1.0.1", "llm_base": "gpt-4o-2024-11-20", "openness": "closed_api_available", "tool_usage": "standard", "score": 68.9, "metric": "test_pass_rate", "submission_time": "2025-11-24T23:38:02.231556", "tags": ["commit0"], "total_cost": 44.45, "total_runtime": 644.5}
|
| 3 |
+
{"agent_name": "1.0.0", "llm_base": "gpt-4-turbo-2024-04-09", "openness": "closed_api_available", "tool_usage": "standard", "score": 61.5, "metric": "test_pass_rate", "submission_time": "2025-11-24T23:38:02.231563", "tags": ["commit0"], "total_cost": 40.75, "total_runtime": 607.5}
|
| 4 |
+
{"agent_name": "0.9.5", "llm_base": "gpt-4o-mini-2024-07-18", "openness": "closed_api_available", "tool_usage": "standard", "score": 55.3, "metric": "test_pass_rate", "submission_time": "2025-11-24T23:38:02.231569", "tags": ["commit0"], "total_cost": 37.65, "total_runtime": 576.5}
|
| 5 |
+
{"agent_name": "0.9.0", "llm_base": "claude-3-opus-20240229", "openness": "closed_api_available", "tool_usage": "custom_interface", "score": 52.1, "metric": "test_pass_rate", "submission_time": "2025-11-24T23:38:02.231574", "tags": ["commit0"], "total_cost": 36.05, "total_runtime": 560.5}
|
mock_results/1.0.0-dev1/gaia.jsonl
CHANGED
|
@@ -1,5 +1,5 @@
|
|
| 1 |
-
{"agent_name": "
|
| 2 |
-
{"agent_name": "
|
| 3 |
-
{"agent_name": "
|
| 4 |
-
{"agent_name": "
|
| 5 |
-
{"agent_name": "
|
|
|
|
| 1 |
+
{"agent_name": "1.0.2", "llm_base": "claude-3-5-sonnet-20241022", "openness": "closed_api_available", "tool_usage": "standard", "score": 58.7, "metric": "accuracy", "submission_time": "2025-11-24T23:38:02.231711", "tags": ["gaia"], "total_cost": 39.35, "total_runtime": 593.5}
|
| 2 |
+
{"agent_name": "1.0.1", "llm_base": "gpt-4o-2024-11-20", "openness": "closed_api_available", "tool_usage": "standard", "score": 55.2, "metric": "accuracy", "submission_time": "2025-11-24T23:38:02.231728", "tags": ["gaia"], "total_cost": 37.6, "total_runtime": 576.0}
|
| 3 |
+
{"agent_name": "1.0.0", "llm_base": "gpt-4-turbo-2024-04-09", "openness": "closed_api_available", "tool_usage": "standard", "score": 48.3, "metric": "accuracy", "submission_time": "2025-11-24T23:38:02.231735", "tags": ["gaia"], "total_cost": 34.15, "total_runtime": 541.5}
|
| 4 |
+
{"agent_name": "0.9.5", "llm_base": "gpt-4o-mini-2024-07-18", "openness": "closed_api_available", "tool_usage": "standard", "score": 42.1, "metric": "accuracy", "submission_time": "2025-11-24T23:38:02.231741", "tags": ["gaia"], "total_cost": 31.05, "total_runtime": 510.5}
|
| 5 |
+
{"agent_name": "0.9.0", "llm_base": "claude-3-opus-20240229", "openness": "closed_api_available", "tool_usage": "custom_interface", "score": 39.4, "metric": "accuracy", "submission_time": "2025-11-24T23:38:02.231749", "tags": ["gaia"], "total_cost": 29.7, "total_runtime": 497.0}
|
mock_results/1.0.0-dev1/multi-swe-bench.jsonl
CHANGED
|
@@ -1,5 +1,5 @@
|
|
| 1 |
-
{"agent_name": "
|
| 2 |
-
{"agent_name": "
|
| 3 |
-
{"agent_name": "
|
| 4 |
-
{"agent_name": "
|
| 5 |
-
{"agent_name": "
|
|
|
|
| 1 |
+
{"agent_name": "1.0.2", "llm_base": "claude-3-5-sonnet-20241022", "openness": "closed_api_available", "tool_usage": "standard", "score": 35.2, "metric": "resolve_rate", "submission_time": "2025-11-24T23:38:02.230869", "tags": ["multi-swe-bench"], "total_cost": 27.6, "total_runtime": 476.0}
|
| 2 |
+
{"agent_name": "1.0.1", "llm_base": "gpt-4o-2024-11-20", "openness": "closed_api_available", "tool_usage": "standard", "score": 32.8, "metric": "resolve_rate", "submission_time": "2025-11-24T23:38:02.230889", "tags": ["multi-swe-bench"], "total_cost": 26.4, "total_runtime": 464.0}
|
| 3 |
+
{"agent_name": "1.0.0", "llm_base": "gpt-4-turbo-2024-04-09", "openness": "closed_api_available", "tool_usage": "standard", "score": 28.4, "metric": "resolve_rate", "submission_time": "2025-11-24T23:38:02.230899", "tags": ["multi-swe-bench"], "total_cost": 24.2, "total_runtime": 442.0}
|
| 4 |
+
{"agent_name": "0.9.5", "llm_base": "gpt-4o-mini-2024-07-18", "openness": "closed_api_available", "tool_usage": "standard", "score": 24.1, "metric": "resolve_rate", "submission_time": "2025-11-24T23:38:02.230908", "tags": ["multi-swe-bench"], "total_cost": 22.05, "total_runtime": 420.5}
|
| 5 |
+
{"agent_name": "0.9.0", "llm_base": "claude-3-opus-20240229", "openness": "closed_api_available", "tool_usage": "custom_interface", "score": 21.5, "metric": "resolve_rate", "submission_time": "2025-11-24T23:38:02.230917", "tags": ["multi-swe-bench"], "total_cost": 20.75, "total_runtime": 407.5}
|
mock_results/1.0.0-dev1/swe-bench-multimodal.jsonl
CHANGED
|
@@ -1,5 +1,5 @@
|
|
| 1 |
-
{"agent_name": "
|
| 2 |
-
{"agent_name": "
|
| 3 |
-
{"agent_name": "
|
| 4 |
-
{"agent_name": "
|
| 5 |
-
{"agent_name": "
|
|
|
|
| 1 |
+
{"agent_name": "1.0.2", "llm_base": "claude-3-5-sonnet-20241022", "openness": "closed_api_available", "tool_usage": "standard", "score": 42.1, "metric": "resolve_rate", "submission_time": "2025-11-24T23:38:02.231057", "tags": ["swe-bench-multimodal"], "total_cost": 31.05, "total_runtime": 510.5}
|
| 2 |
+
{"agent_name": "1.0.1", "llm_base": "gpt-4o-2024-11-20", "openness": "closed_api_available", "tool_usage": "standard", "score": 39.5, "metric": "resolve_rate", "submission_time": "2025-11-24T23:38:02.231073", "tags": ["swe-bench-multimodal"], "total_cost": 29.75, "total_runtime": 497.5}
|
| 3 |
+
{"agent_name": "1.0.0", "llm_base": "gpt-4-turbo-2024-04-09", "openness": "closed_api_available", "tool_usage": "standard", "score": 34.2, "metric": "resolve_rate", "submission_time": "2025-11-24T23:38:02.231082", "tags": ["swe-bench-multimodal"], "total_cost": 27.1, "total_runtime": 471.0}
|
| 4 |
+
{"agent_name": "0.9.5", "llm_base": "gpt-4o-mini-2024-07-18", "openness": "closed_api_available", "tool_usage": "standard", "score": 28.9, "metric": "resolve_rate", "submission_time": "2025-11-24T23:38:02.231088", "tags": ["swe-bench-multimodal"], "total_cost": 24.45, "total_runtime": 444.5}
|
| 5 |
+
{"agent_name": "0.9.0", "llm_base": "claude-3-opus-20240229", "openness": "closed_api_available", "tool_usage": "custom_interface", "score": 25.7, "metric": "resolve_rate", "submission_time": "2025-11-24T23:38:02.231093", "tags": ["swe-bench-multimodal"], "total_cost": 22.85, "total_runtime": 428.5}
|
mock_results/1.0.0-dev1/swe-bench.jsonl
CHANGED
|
@@ -1,5 +1,5 @@
|
|
| 1 |
-
{"agent_name": "
|
| 2 |
-
{"agent_name": "
|
| 3 |
-
{"agent_name": "
|
| 4 |
-
{"agent_name": "
|
| 5 |
-
{"agent_name": "
|
|
|
|
| 1 |
+
{"agent_name": "1.0.2", "llm_base": "claude-3-5-sonnet-20241022", "openness": "closed_api_available", "tool_usage": "standard", "score": 48.3, "metric": "resolve_rate", "submission_time": "2025-11-24T23:38:02.230638", "tags": ["swe-bench"], "total_cost": 34.15, "total_runtime": 541.5}
|
| 2 |
+
{"agent_name": "1.0.1", "llm_base": "gpt-4o-2024-11-20", "openness": "closed_api_available", "tool_usage": "standard", "score": 45.1, "metric": "resolve_rate", "submission_time": "2025-11-24T23:38:02.230668", "tags": ["swe-bench"], "total_cost": 32.55, "total_runtime": 525.5}
|
| 3 |
+
{"agent_name": "1.0.0", "llm_base": "gpt-4-turbo-2024-04-09", "openness": "closed_api_available", "tool_usage": "standard", "score": 38.7, "metric": "resolve_rate", "submission_time": "2025-11-24T23:38:02.230681", "tags": ["swe-bench"], "total_cost": 29.35, "total_runtime": 493.5}
|
| 4 |
+
{"agent_name": "0.9.5", "llm_base": "gpt-4o-mini-2024-07-18", "openness": "closed_api_available", "tool_usage": "standard", "score": 32.5, "metric": "resolve_rate", "submission_time": "2025-11-24T23:38:02.230689", "tags": ["swe-bench"], "total_cost": 26.25, "total_runtime": 462.5}
|
| 5 |
+
{"agent_name": "0.9.0", "llm_base": "claude-3-opus-20240229", "openness": "closed_api_available", "tool_usage": "custom_interface", "score": 29.8, "metric": "resolve_rate", "submission_time": "2025-11-24T23:38:02.230696", "tags": ["swe-bench"], "total_cost": 24.9, "total_runtime": 449.0}
|
mock_results/1.0.0-dev1/swt-bench.jsonl
CHANGED
|
@@ -1,5 +1,5 @@
|
|
| 1 |
-
{"agent_name": "
|
| 2 |
-
{"agent_name": "
|
| 3 |
-
{"agent_name": "
|
| 4 |
-
{"agent_name": "
|
| 5 |
-
{"agent_name": "
|
|
|
|
| 1 |
+
{"agent_name": "1.0.2", "llm_base": "claude-3-5-sonnet-20241022", "openness": "closed_api_available", "tool_usage": "standard", "score": 65.4, "metric": "success_rate", "submission_time": "2025-11-24T23:38:02.231319", "tags": ["swt-bench"], "total_cost": 42.7, "total_runtime": 627.0}
|
| 2 |
+
{"agent_name": "1.0.1", "llm_base": "gpt-4o-2024-11-20", "openness": "closed_api_available", "tool_usage": "standard", "score": 62.3, "metric": "success_rate", "submission_time": "2025-11-24T23:38:02.231344", "tags": ["swt-bench"], "total_cost": 41.15, "total_runtime": 611.5}
|
| 3 |
+
{"agent_name": "1.0.0", "llm_base": "gpt-4-turbo-2024-04-09", "openness": "closed_api_available", "tool_usage": "standard", "score": 54.1, "metric": "success_rate", "submission_time": "2025-11-24T23:38:02.231356", "tags": ["swt-bench"], "total_cost": 37.05, "total_runtime": 570.5}
|
| 4 |
+
{"agent_name": "0.9.5", "llm_base": "gpt-4o-mini-2024-07-18", "openness": "closed_api_available", "tool_usage": "standard", "score": 47.8, "metric": "success_rate", "submission_time": "2025-11-24T23:38:02.231365", "tags": ["swt-bench"], "total_cost": 33.9, "total_runtime": 539.0}
|
| 5 |
+
{"agent_name": "0.9.0", "llm_base": "claude-3-opus-20240229", "openness": "closed_api_available", "tool_usage": "custom_interface", "score": 44.2, "metric": "success_rate", "submission_time": "2025-11-24T23:38:02.231373", "tags": ["swt-bench"], "total_cost": 32.1, "total_runtime": 521.0}
|
simple_data_loader.py
CHANGED
|
@@ -92,14 +92,13 @@ class SimpleLeaderboardViewer:
|
|
| 92 |
first_record = agent_records.iloc[0]
|
| 93 |
record = {
|
| 94 |
# Core agent info - use final display names
|
| 95 |
-
'agent': agent_name, # Will become "Agent" after prettifying
|
| 96 |
-
'models used': first_record['llm_base'], # Will become "
|
| 97 |
'openness': first_record['openness'], # Will become "Openness"
|
| 98 |
'agent tooling': first_record['tool_usage'], # Will become "Agent Tooling"
|
| 99 |
'date': first_record['submission_time'], # Will become "Date"
|
| 100 |
# Additional columns expected by the transformer
|
| 101 |
'id': first_record.get('id', agent_name), # Will become "Id"
|
| 102 |
-
'submitter': first_record.get('submitter', 'Unknown'), # Will become "Submitter"
|
| 103 |
'source': first_record.get('source', ''), # Will become "Source"
|
| 104 |
'logs': first_record.get('logs', ''), # Will become "Logs"
|
| 105 |
}
|
|
|
|
| 92 |
first_record = agent_records.iloc[0]
|
| 93 |
record = {
|
| 94 |
# Core agent info - use final display names
|
| 95 |
+
'agent': agent_name, # Will become "Agent Version" after prettifying
|
| 96 |
+
'models used': first_record['llm_base'], # Will become "Model"
|
| 97 |
'openness': first_record['openness'], # Will become "Openness"
|
| 98 |
'agent tooling': first_record['tool_usage'], # Will become "Agent Tooling"
|
| 99 |
'date': first_record['submission_time'], # Will become "Date"
|
| 100 |
# Additional columns expected by the transformer
|
| 101 |
'id': first_record.get('id', agent_name), # Will become "Id"
|
|
|
|
| 102 |
'source': first_record.get('source', ''), # Will become "Source"
|
| 103 |
'logs': first_record.get('logs', ''), # Will become "Logs"
|
| 104 |
}
|