Spaces:
Running
Running
openhands
committed on
Commit
·
0718569
1
Parent(s):
6a0d1cb
Fix Categories Attempted calculation to handle missing category columns correctly
Browse files
- Changed logic to check if category score is not None and not 0.0
- This fixes the issue where missing categories were being counted as attempted
- Now correctly shows X/5 based on actual categories with data
- data/extracted/1.0.0-dev1/results/20251124_claude_3_5_sonnet_20241022/metadata.json +0 -9
- data/extracted/1.0.0-dev1/results/20251124_claude_3_5_sonnet_20241022/scores.json +0 -62
- data/extracted/1.0.0-dev1/results/20251124_claude_3_opus_20240229/metadata.json +0 -9
- data/extracted/1.0.0-dev1/results/20251124_claude_3_opus_20240229/scores.json +0 -62
- data/extracted/1.0.0-dev1/results/20251124_gpt_4_turbo_2024_04_09/metadata.json +0 -9
- data/extracted/1.0.0-dev1/results/20251124_gpt_4_turbo_2024_04_09/scores.json +0 -62
- data/extracted/1.0.0-dev1/results/20251124_gpt_4o_2024_11_20/metadata.json +0 -9
- data/extracted/1.0.0-dev1/results/20251124_gpt_4o_2024_11_20/scores.json +0 -62
- data/extracted/1.0.0-dev1/results/20251124_gpt_4o_mini_2024_07_18/metadata.json +0 -9
- data/extracted/1.0.0-dev1/results/20251124_gpt_4o_mini_2024_07_18/scores.json +0 -62
- data/extracted/agenteval.json +0 -74
- data/extracted/commit0.jsonl +0 -5
- data/extracted/gaia.jsonl +0 -5
- data/extracted/multi-swe-bench.jsonl +0 -5
- data/extracted/swe-bench-multimodal.jsonl +0 -5
- data/extracted/swe-bench.jsonl +0 -5
- data/extracted/swt-bench.jsonl +0 -5
- data/extracted/test.jsonl +0 -30
- data/extracted/test.parquet +0 -0
- data/extracted/validation.jsonl +0 -30
- data/extracted/validation.parquet +0 -0
- leaderboard_transformer.py +1 -1
data/extracted/1.0.0-dev1/results/20251124_claude_3_5_sonnet_20241022/metadata.json
DELETED
|
@@ -1,9 +0,0 @@
|
|
| 1 |
-
{
|
| 2 |
-
"agent_name": "OpenHands CodeAct v2.1",
|
| 3 |
-
"agent_version": "OpenHands CodeAct v2.1",
|
| 4 |
-
"model": "claude-3-5-sonnet-20241022",
|
| 5 |
-
"openness": "closed_api_available",
|
| 6 |
-
"tool_usage": "standard",
|
| 7 |
-
"submission_time": "2025-11-24T19:56:00.092865",
|
| 8 |
-
"directory_name": "20251124_claude_3_5_sonnet_20241022"
|
| 9 |
-
}
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
data/extracted/1.0.0-dev1/results/20251124_claude_3_5_sonnet_20241022/scores.json
DELETED
|
@@ -1,62 +0,0 @@
|
|
| 1 |
-
[
|
| 2 |
-
{
|
| 3 |
-
"benchmark": "swe-bench",
|
| 4 |
-
"score": 48.3,
|
| 5 |
-
"metric": "resolve_rate",
|
| 6 |
-
"total_cost": 34.15,
|
| 7 |
-
"total_runtime": 541.5,
|
| 8 |
-
"tags": [
|
| 9 |
-
"swe-bench"
|
| 10 |
-
]
|
| 11 |
-
},
|
| 12 |
-
{
|
| 13 |
-
"benchmark": "swe-bench-multimodal",
|
| 14 |
-
"score": 42.1,
|
| 15 |
-
"metric": "resolve_rate",
|
| 16 |
-
"total_cost": 31.05,
|
| 17 |
-
"total_runtime": 510.5,
|
| 18 |
-
"tags": [
|
| 19 |
-
"swe-bench-multimodal"
|
| 20 |
-
]
|
| 21 |
-
},
|
| 22 |
-
{
|
| 23 |
-
"benchmark": "commit0",
|
| 24 |
-
"score": 71.2,
|
| 25 |
-
"metric": "test_pass_rate",
|
| 26 |
-
"total_cost": 45.6,
|
| 27 |
-
"total_runtime": 656.0,
|
| 28 |
-
"tags": [
|
| 29 |
-
"commit0"
|
| 30 |
-
]
|
| 31 |
-
},
|
| 32 |
-
{
|
| 33 |
-
"benchmark": "multi-swe-bench",
|
| 34 |
-
"score": 35.2,
|
| 35 |
-
"metric": "resolve_rate",
|
| 36 |
-
"total_cost": 27.6,
|
| 37 |
-
"total_runtime": 476.0,
|
| 38 |
-
"tags": [
|
| 39 |
-
"multi-swe-bench"
|
| 40 |
-
]
|
| 41 |
-
},
|
| 42 |
-
{
|
| 43 |
-
"benchmark": "swt-bench",
|
| 44 |
-
"score": 65.4,
|
| 45 |
-
"metric": "success_rate",
|
| 46 |
-
"total_cost": 42.7,
|
| 47 |
-
"total_runtime": 627.0,
|
| 48 |
-
"tags": [
|
| 49 |
-
"swt-bench"
|
| 50 |
-
]
|
| 51 |
-
},
|
| 52 |
-
{
|
| 53 |
-
"benchmark": "gaia",
|
| 54 |
-
"score": 58.7,
|
| 55 |
-
"metric": "accuracy",
|
| 56 |
-
"total_cost": 39.35,
|
| 57 |
-
"total_runtime": 593.5,
|
| 58 |
-
"tags": [
|
| 59 |
-
"gaia"
|
| 60 |
-
]
|
| 61 |
-
}
|
| 62 |
-
]
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
data/extracted/1.0.0-dev1/results/20251124_claude_3_opus_20240229/metadata.json
DELETED
|
@@ -1,9 +0,0 @@
|
|
| 1 |
-
{
|
| 2 |
-
"agent_name": "SWE-Agent",
|
| 3 |
-
"agent_version": "SWE-Agent",
|
| 4 |
-
"model": "claude-3-opus-20240229",
|
| 5 |
-
"openness": "closed_api_available",
|
| 6 |
-
"tool_usage": "custom_interface",
|
| 7 |
-
"submission_time": "2025-11-24T19:56:00.092922",
|
| 8 |
-
"directory_name": "20251124_claude_3_opus_20240229"
|
| 9 |
-
}
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
data/extracted/1.0.0-dev1/results/20251124_claude_3_opus_20240229/scores.json
DELETED
|
@@ -1,62 +0,0 @@
|
|
| 1 |
-
[
|
| 2 |
-
{
|
| 3 |
-
"benchmark": "swe-bench",
|
| 4 |
-
"score": 29.8,
|
| 5 |
-
"metric": "resolve_rate",
|
| 6 |
-
"total_cost": 24.9,
|
| 7 |
-
"total_runtime": 449.0,
|
| 8 |
-
"tags": [
|
| 9 |
-
"swe-bench"
|
| 10 |
-
]
|
| 11 |
-
},
|
| 12 |
-
{
|
| 13 |
-
"benchmark": "swe-bench-multimodal",
|
| 14 |
-
"score": 25.7,
|
| 15 |
-
"metric": "resolve_rate",
|
| 16 |
-
"total_cost": 22.85,
|
| 17 |
-
"total_runtime": 428.5,
|
| 18 |
-
"tags": [
|
| 19 |
-
"swe-bench-multimodal"
|
| 20 |
-
]
|
| 21 |
-
},
|
| 22 |
-
{
|
| 23 |
-
"benchmark": "commit0",
|
| 24 |
-
"score": 52.1,
|
| 25 |
-
"metric": "test_pass_rate",
|
| 26 |
-
"total_cost": 36.05,
|
| 27 |
-
"total_runtime": 560.5,
|
| 28 |
-
"tags": [
|
| 29 |
-
"commit0"
|
| 30 |
-
]
|
| 31 |
-
},
|
| 32 |
-
{
|
| 33 |
-
"benchmark": "multi-swe-bench",
|
| 34 |
-
"score": 21.5,
|
| 35 |
-
"metric": "resolve_rate",
|
| 36 |
-
"total_cost": 20.75,
|
| 37 |
-
"total_runtime": 407.5,
|
| 38 |
-
"tags": [
|
| 39 |
-
"multi-swe-bench"
|
| 40 |
-
]
|
| 41 |
-
},
|
| 42 |
-
{
|
| 43 |
-
"benchmark": "swt-bench",
|
| 44 |
-
"score": 44.2,
|
| 45 |
-
"metric": "success_rate",
|
| 46 |
-
"total_cost": 32.1,
|
| 47 |
-
"total_runtime": 521.0,
|
| 48 |
-
"tags": [
|
| 49 |
-
"swt-bench"
|
| 50 |
-
]
|
| 51 |
-
},
|
| 52 |
-
{
|
| 53 |
-
"benchmark": "gaia",
|
| 54 |
-
"score": 39.4,
|
| 55 |
-
"metric": "accuracy",
|
| 56 |
-
"total_cost": 29.7,
|
| 57 |
-
"total_runtime": 497.0,
|
| 58 |
-
"tags": [
|
| 59 |
-
"gaia"
|
| 60 |
-
]
|
| 61 |
-
}
|
| 62 |
-
]
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
data/extracted/1.0.0-dev1/results/20251124_gpt_4_turbo_2024_04_09/metadata.json
DELETED
|
@@ -1,9 +0,0 @@
|
|
| 1 |
-
{
|
| 2 |
-
"agent_name": "AutoCodeRover",
|
| 3 |
-
"agent_version": "AutoCodeRover",
|
| 4 |
-
"model": "gpt-4-turbo-2024-04-09",
|
| 5 |
-
"openness": "closed_api_available",
|
| 6 |
-
"tool_usage": "standard",
|
| 7 |
-
"submission_time": "2025-11-24T19:56:00.092908",
|
| 8 |
-
"directory_name": "20251124_gpt_4_turbo_2024_04_09"
|
| 9 |
-
}
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
data/extracted/1.0.0-dev1/results/20251124_gpt_4_turbo_2024_04_09/scores.json
DELETED
|
@@ -1,62 +0,0 @@
|
|
| 1 |
-
[
|
| 2 |
-
{
|
| 3 |
-
"benchmark": "swe-bench",
|
| 4 |
-
"score": 38.7,
|
| 5 |
-
"metric": "resolve_rate",
|
| 6 |
-
"total_cost": 29.35,
|
| 7 |
-
"total_runtime": 493.5,
|
| 8 |
-
"tags": [
|
| 9 |
-
"swe-bench"
|
| 10 |
-
]
|
| 11 |
-
},
|
| 12 |
-
{
|
| 13 |
-
"benchmark": "swe-bench-multimodal",
|
| 14 |
-
"score": 34.2,
|
| 15 |
-
"metric": "resolve_rate",
|
| 16 |
-
"total_cost": 27.1,
|
| 17 |
-
"total_runtime": 471.0,
|
| 18 |
-
"tags": [
|
| 19 |
-
"swe-bench-multimodal"
|
| 20 |
-
]
|
| 21 |
-
},
|
| 22 |
-
{
|
| 23 |
-
"benchmark": "commit0",
|
| 24 |
-
"score": 61.5,
|
| 25 |
-
"metric": "test_pass_rate",
|
| 26 |
-
"total_cost": 40.75,
|
| 27 |
-
"total_runtime": 607.5,
|
| 28 |
-
"tags": [
|
| 29 |
-
"commit0"
|
| 30 |
-
]
|
| 31 |
-
},
|
| 32 |
-
{
|
| 33 |
-
"benchmark": "multi-swe-bench",
|
| 34 |
-
"score": 28.4,
|
| 35 |
-
"metric": "resolve_rate",
|
| 36 |
-
"total_cost": 24.2,
|
| 37 |
-
"total_runtime": 442.0,
|
| 38 |
-
"tags": [
|
| 39 |
-
"multi-swe-bench"
|
| 40 |
-
]
|
| 41 |
-
},
|
| 42 |
-
{
|
| 43 |
-
"benchmark": "swt-bench",
|
| 44 |
-
"score": 54.1,
|
| 45 |
-
"metric": "success_rate",
|
| 46 |
-
"total_cost": 37.05,
|
| 47 |
-
"total_runtime": 570.5,
|
| 48 |
-
"tags": [
|
| 49 |
-
"swt-bench"
|
| 50 |
-
]
|
| 51 |
-
},
|
| 52 |
-
{
|
| 53 |
-
"benchmark": "gaia",
|
| 54 |
-
"score": 48.3,
|
| 55 |
-
"metric": "accuracy",
|
| 56 |
-
"total_cost": 34.15,
|
| 57 |
-
"total_runtime": 541.5,
|
| 58 |
-
"tags": [
|
| 59 |
-
"gaia"
|
| 60 |
-
]
|
| 61 |
-
}
|
| 62 |
-
]
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
data/extracted/1.0.0-dev1/results/20251124_gpt_4o_2024_11_20/metadata.json
DELETED
|
@@ -1,9 +0,0 @@
|
|
| 1 |
-
{
|
| 2 |
-
"agent_name": "OpenHands CodeAct v2.0",
|
| 3 |
-
"agent_version": "OpenHands CodeAct v2.0",
|
| 4 |
-
"model": "gpt-4o-2024-11-20",
|
| 5 |
-
"openness": "closed_api_available",
|
| 6 |
-
"tool_usage": "standard",
|
| 7 |
-
"submission_time": "2025-11-24T19:56:00.092895",
|
| 8 |
-
"directory_name": "20251124_gpt_4o_2024_11_20"
|
| 9 |
-
}
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
data/extracted/1.0.0-dev1/results/20251124_gpt_4o_2024_11_20/scores.json
DELETED
|
@@ -1,62 +0,0 @@
|
|
| 1 |
-
[
|
| 2 |
-
{
|
| 3 |
-
"benchmark": "swe-bench",
|
| 4 |
-
"score": 45.1,
|
| 5 |
-
"metric": "resolve_rate",
|
| 6 |
-
"total_cost": 32.55,
|
| 7 |
-
"total_runtime": 525.5,
|
| 8 |
-
"tags": [
|
| 9 |
-
"swe-bench"
|
| 10 |
-
]
|
| 11 |
-
},
|
| 12 |
-
{
|
| 13 |
-
"benchmark": "swe-bench-multimodal",
|
| 14 |
-
"score": 39.5,
|
| 15 |
-
"metric": "resolve_rate",
|
| 16 |
-
"total_cost": 29.75,
|
| 17 |
-
"total_runtime": 497.5,
|
| 18 |
-
"tags": [
|
| 19 |
-
"swe-bench-multimodal"
|
| 20 |
-
]
|
| 21 |
-
},
|
| 22 |
-
{
|
| 23 |
-
"benchmark": "commit0",
|
| 24 |
-
"score": 68.9,
|
| 25 |
-
"metric": "test_pass_rate",
|
| 26 |
-
"total_cost": 44.45,
|
| 27 |
-
"total_runtime": 644.5,
|
| 28 |
-
"tags": [
|
| 29 |
-
"commit0"
|
| 30 |
-
]
|
| 31 |
-
},
|
| 32 |
-
{
|
| 33 |
-
"benchmark": "multi-swe-bench",
|
| 34 |
-
"score": 32.8,
|
| 35 |
-
"metric": "resolve_rate",
|
| 36 |
-
"total_cost": 26.4,
|
| 37 |
-
"total_runtime": 464.0,
|
| 38 |
-
"tags": [
|
| 39 |
-
"multi-swe-bench"
|
| 40 |
-
]
|
| 41 |
-
},
|
| 42 |
-
{
|
| 43 |
-
"benchmark": "swt-bench",
|
| 44 |
-
"score": 62.3,
|
| 45 |
-
"metric": "success_rate",
|
| 46 |
-
"total_cost": 41.15,
|
| 47 |
-
"total_runtime": 611.5,
|
| 48 |
-
"tags": [
|
| 49 |
-
"swt-bench"
|
| 50 |
-
]
|
| 51 |
-
},
|
| 52 |
-
{
|
| 53 |
-
"benchmark": "gaia",
|
| 54 |
-
"score": 55.2,
|
| 55 |
-
"metric": "accuracy",
|
| 56 |
-
"total_cost": 37.6,
|
| 57 |
-
"total_runtime": 576.0,
|
| 58 |
-
"tags": [
|
| 59 |
-
"gaia"
|
| 60 |
-
]
|
| 61 |
-
}
|
| 62 |
-
]
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
data/extracted/1.0.0-dev1/results/20251124_gpt_4o_mini_2024_07_18/metadata.json
DELETED
|
@@ -1,9 +0,0 @@
|
|
| 1 |
-
{
|
| 2 |
-
"agent_name": "Agentless",
|
| 3 |
-
"agent_version": "Agentless",
|
| 4 |
-
"model": "gpt-4o-mini-2024-07-18",
|
| 5 |
-
"openness": "closed_api_available",
|
| 6 |
-
"tool_usage": "standard",
|
| 7 |
-
"submission_time": "2025-11-24T19:56:00.092916",
|
| 8 |
-
"directory_name": "20251124_gpt_4o_mini_2024_07_18"
|
| 9 |
-
}
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
data/extracted/1.0.0-dev1/results/20251124_gpt_4o_mini_2024_07_18/scores.json
DELETED
|
@@ -1,62 +0,0 @@
|
|
| 1 |
-
[
|
| 2 |
-
{
|
| 3 |
-
"benchmark": "swe-bench",
|
| 4 |
-
"score": 32.5,
|
| 5 |
-
"metric": "resolve_rate",
|
| 6 |
-
"total_cost": 26.25,
|
| 7 |
-
"total_runtime": 462.5,
|
| 8 |
-
"tags": [
|
| 9 |
-
"swe-bench"
|
| 10 |
-
]
|
| 11 |
-
},
|
| 12 |
-
{
|
| 13 |
-
"benchmark": "swe-bench-multimodal",
|
| 14 |
-
"score": 28.9,
|
| 15 |
-
"metric": "resolve_rate",
|
| 16 |
-
"total_cost": 24.45,
|
| 17 |
-
"total_runtime": 444.5,
|
| 18 |
-
"tags": [
|
| 19 |
-
"swe-bench-multimodal"
|
| 20 |
-
]
|
| 21 |
-
},
|
| 22 |
-
{
|
| 23 |
-
"benchmark": "commit0",
|
| 24 |
-
"score": 55.3,
|
| 25 |
-
"metric": "test_pass_rate",
|
| 26 |
-
"total_cost": 37.65,
|
| 27 |
-
"total_runtime": 576.5,
|
| 28 |
-
"tags": [
|
| 29 |
-
"commit0"
|
| 30 |
-
]
|
| 31 |
-
},
|
| 32 |
-
{
|
| 33 |
-
"benchmark": "multi-swe-bench",
|
| 34 |
-
"score": 24.1,
|
| 35 |
-
"metric": "resolve_rate",
|
| 36 |
-
"total_cost": 22.05,
|
| 37 |
-
"total_runtime": 420.5,
|
| 38 |
-
"tags": [
|
| 39 |
-
"multi-swe-bench"
|
| 40 |
-
]
|
| 41 |
-
},
|
| 42 |
-
{
|
| 43 |
-
"benchmark": "swt-bench",
|
| 44 |
-
"score": 47.8,
|
| 45 |
-
"metric": "success_rate",
|
| 46 |
-
"total_cost": 33.9,
|
| 47 |
-
"total_runtime": 539.0,
|
| 48 |
-
"tags": [
|
| 49 |
-
"swt-bench"
|
| 50 |
-
]
|
| 51 |
-
},
|
| 52 |
-
{
|
| 53 |
-
"benchmark": "gaia",
|
| 54 |
-
"score": 42.1,
|
| 55 |
-
"metric": "accuracy",
|
| 56 |
-
"total_cost": 31.05,
|
| 57 |
-
"total_runtime": 510.5,
|
| 58 |
-
"tags": [
|
| 59 |
-
"gaia"
|
| 60 |
-
]
|
| 61 |
-
}
|
| 62 |
-
]
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
data/extracted/agenteval.json
DELETED
|
@@ -1,74 +0,0 @@
|
|
| 1 |
-
{
|
| 2 |
-
"suite_config": {
|
| 3 |
-
"name": "openhands-index",
|
| 4 |
-
"version": "1.0.0-dev1",
|
| 5 |
-
"splits": [
|
| 6 |
-
{
|
| 7 |
-
"name": "swe-bench",
|
| 8 |
-
"tasks": [
|
| 9 |
-
{
|
| 10 |
-
"name": "swe-bench",
|
| 11 |
-
"tags": [
|
| 12 |
-
"swe-bench"
|
| 13 |
-
]
|
| 14 |
-
}
|
| 15 |
-
]
|
| 16 |
-
},
|
| 17 |
-
{
|
| 18 |
-
"name": "multi-swe-bench",
|
| 19 |
-
"tasks": [
|
| 20 |
-
{
|
| 21 |
-
"name": "multi-swe-bench",
|
| 22 |
-
"tags": [
|
| 23 |
-
"multi-swe-bench"
|
| 24 |
-
]
|
| 25 |
-
}
|
| 26 |
-
]
|
| 27 |
-
},
|
| 28 |
-
{
|
| 29 |
-
"name": "swe-bench-multimodal",
|
| 30 |
-
"tasks": [
|
| 31 |
-
{
|
| 32 |
-
"name": "swe-bench-multimodal",
|
| 33 |
-
"tags": [
|
| 34 |
-
"swe-bench-multimodal"
|
| 35 |
-
]
|
| 36 |
-
}
|
| 37 |
-
]
|
| 38 |
-
},
|
| 39 |
-
{
|
| 40 |
-
"name": "swt-bench",
|
| 41 |
-
"tasks": [
|
| 42 |
-
{
|
| 43 |
-
"name": "swt-bench",
|
| 44 |
-
"tags": [
|
| 45 |
-
"swt-bench"
|
| 46 |
-
]
|
| 47 |
-
}
|
| 48 |
-
]
|
| 49 |
-
},
|
| 50 |
-
{
|
| 51 |
-
"name": "commit0",
|
| 52 |
-
"tasks": [
|
| 53 |
-
{
|
| 54 |
-
"name": "commit0",
|
| 55 |
-
"tags": [
|
| 56 |
-
"commit0"
|
| 57 |
-
]
|
| 58 |
-
}
|
| 59 |
-
]
|
| 60 |
-
},
|
| 61 |
-
{
|
| 62 |
-
"name": "gaia",
|
| 63 |
-
"tasks": [
|
| 64 |
-
{
|
| 65 |
-
"name": "gaia",
|
| 66 |
-
"tags": [
|
| 67 |
-
"gaia"
|
| 68 |
-
]
|
| 69 |
-
}
|
| 70 |
-
]
|
| 71 |
-
}
|
| 72 |
-
]
|
| 73 |
-
}
|
| 74 |
-
}
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
data/extracted/commit0.jsonl
DELETED
|
@@ -1,5 +0,0 @@
|
|
| 1 |
-
{"agent_name": "1.0.2", "llm_base": "claude-3-5-sonnet-20241022", "openness": "closed_api_available", "tool_usage": "standard", "score": 71.2, "metric": "test_pass_rate", "submission_time": "2025-11-24T23:38:02.231539", "tags": ["commit0"], "total_cost": 45.6, "total_runtime": 656.0}
|
| 2 |
-
{"agent_name": "1.0.1", "llm_base": "gpt-4o-2024-11-20", "openness": "closed_api_available", "tool_usage": "standard", "score": 68.9, "metric": "test_pass_rate", "submission_time": "2025-11-24T23:38:02.231556", "tags": ["commit0"], "total_cost": 44.45, "total_runtime": 644.5}
|
| 3 |
-
{"agent_name": "1.0.0", "llm_base": "gpt-4-turbo-2024-04-09", "openness": "closed_api_available", "tool_usage": "standard", "score": 61.5, "metric": "test_pass_rate", "submission_time": "2025-11-24T23:38:02.231563", "tags": ["commit0"], "total_cost": 40.75, "total_runtime": 607.5}
|
| 4 |
-
{"agent_name": "0.9.5", "llm_base": "gpt-4o-mini-2024-07-18", "openness": "closed_api_available", "tool_usage": "standard", "score": 55.3, "metric": "test_pass_rate", "submission_time": "2025-11-24T23:38:02.231569", "tags": ["commit0"], "total_cost": 37.65, "total_runtime": 576.5}
|
| 5 |
-
{"agent_name": "0.9.0", "llm_base": "claude-3-opus-20240229", "openness": "closed_api_available", "tool_usage": "custom_interface", "score": 52.1, "metric": "test_pass_rate", "submission_time": "2025-11-24T23:38:02.231574", "tags": ["commit0"], "total_cost": 36.05, "total_runtime": 560.5}
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
data/extracted/gaia.jsonl
DELETED
|
@@ -1,5 +0,0 @@
|
|
| 1 |
-
{"agent_name": "1.0.2", "llm_base": "claude-3-5-sonnet-20241022", "openness": "closed_api_available", "tool_usage": "standard", "score": 58.7, "metric": "accuracy", "submission_time": "2025-11-24T23:38:02.231711", "tags": ["gaia"], "total_cost": 39.35, "total_runtime": 593.5}
|
| 2 |
-
{"agent_name": "1.0.1", "llm_base": "gpt-4o-2024-11-20", "openness": "closed_api_available", "tool_usage": "standard", "score": 55.2, "metric": "accuracy", "submission_time": "2025-11-24T23:38:02.231728", "tags": ["gaia"], "total_cost": 37.6, "total_runtime": 576.0}
|
| 3 |
-
{"agent_name": "1.0.0", "llm_base": "gpt-4-turbo-2024-04-09", "openness": "closed_api_available", "tool_usage": "standard", "score": 48.3, "metric": "accuracy", "submission_time": "2025-11-24T23:38:02.231735", "tags": ["gaia"], "total_cost": 34.15, "total_runtime": 541.5}
|
| 4 |
-
{"agent_name": "0.9.5", "llm_base": "gpt-4o-mini-2024-07-18", "openness": "closed_api_available", "tool_usage": "standard", "score": 42.1, "metric": "accuracy", "submission_time": "2025-11-24T23:38:02.231741", "tags": ["gaia"], "total_cost": 31.05, "total_runtime": 510.5}
|
| 5 |
-
{"agent_name": "0.9.0", "llm_base": "claude-3-opus-20240229", "openness": "closed_api_available", "tool_usage": "custom_interface", "score": 39.4, "metric": "accuracy", "submission_time": "2025-11-24T23:38:02.231749", "tags": ["gaia"], "total_cost": 29.7, "total_runtime": 497.0}
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
data/extracted/multi-swe-bench.jsonl
DELETED
|
@@ -1,5 +0,0 @@
|
|
| 1 |
-
{"agent_name": "1.0.2", "llm_base": "claude-3-5-sonnet-20241022", "openness": "closed_api_available", "tool_usage": "standard", "score": 35.2, "metric": "resolve_rate", "submission_time": "2025-11-24T23:38:02.230869", "tags": ["multi-swe-bench"], "total_cost": 27.6, "total_runtime": 476.0}
|
| 2 |
-
{"agent_name": "1.0.1", "llm_base": "gpt-4o-2024-11-20", "openness": "closed_api_available", "tool_usage": "standard", "score": 32.8, "metric": "resolve_rate", "submission_time": "2025-11-24T23:38:02.230889", "tags": ["multi-swe-bench"], "total_cost": 26.4, "total_runtime": 464.0}
|
| 3 |
-
{"agent_name": "1.0.0", "llm_base": "gpt-4-turbo-2024-04-09", "openness": "closed_api_available", "tool_usage": "standard", "score": 28.4, "metric": "resolve_rate", "submission_time": "2025-11-24T23:38:02.230899", "tags": ["multi-swe-bench"], "total_cost": 24.2, "total_runtime": 442.0}
|
| 4 |
-
{"agent_name": "0.9.5", "llm_base": "gpt-4o-mini-2024-07-18", "openness": "closed_api_available", "tool_usage": "standard", "score": 24.1, "metric": "resolve_rate", "submission_time": "2025-11-24T23:38:02.230908", "tags": ["multi-swe-bench"], "total_cost": 22.05, "total_runtime": 420.5}
|
| 5 |
-
{"agent_name": "0.9.0", "llm_base": "claude-3-opus-20240229", "openness": "closed_api_available", "tool_usage": "custom_interface", "score": 21.5, "metric": "resolve_rate", "submission_time": "2025-11-24T23:38:02.230917", "tags": ["multi-swe-bench"], "total_cost": 20.75, "total_runtime": 407.5}
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
data/extracted/swe-bench-multimodal.jsonl
DELETED
|
@@ -1,5 +0,0 @@
|
|
| 1 |
-
{"agent_name": "1.0.2", "llm_base": "claude-3-5-sonnet-20241022", "openness": "closed_api_available", "tool_usage": "standard", "score": 42.1, "metric": "resolve_rate", "submission_time": "2025-11-24T23:38:02.231057", "tags": ["swe-bench-multimodal"], "total_cost": 31.05, "total_runtime": 510.5}
|
| 2 |
-
{"agent_name": "1.0.1", "llm_base": "gpt-4o-2024-11-20", "openness": "closed_api_available", "tool_usage": "standard", "score": 39.5, "metric": "resolve_rate", "submission_time": "2025-11-24T23:38:02.231073", "tags": ["swe-bench-multimodal"], "total_cost": 29.75, "total_runtime": 497.5}
|
| 3 |
-
{"agent_name": "1.0.0", "llm_base": "gpt-4-turbo-2024-04-09", "openness": "closed_api_available", "tool_usage": "standard", "score": 34.2, "metric": "resolve_rate", "submission_time": "2025-11-24T23:38:02.231082", "tags": ["swe-bench-multimodal"], "total_cost": 27.1, "total_runtime": 471.0}
|
| 4 |
-
{"agent_name": "0.9.5", "llm_base": "gpt-4o-mini-2024-07-18", "openness": "closed_api_available", "tool_usage": "standard", "score": 28.9, "metric": "resolve_rate", "submission_time": "2025-11-24T23:38:02.231088", "tags": ["swe-bench-multimodal"], "total_cost": 24.45, "total_runtime": 444.5}
|
| 5 |
-
{"agent_name": "0.9.0", "llm_base": "claude-3-opus-20240229", "openness": "closed_api_available", "tool_usage": "custom_interface", "score": 25.7, "metric": "resolve_rate", "submission_time": "2025-11-24T23:38:02.231093", "tags": ["swe-bench-multimodal"], "total_cost": 22.85, "total_runtime": 428.5}
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
data/extracted/swe-bench.jsonl
DELETED
|
@@ -1,5 +0,0 @@
|
|
| 1 |
-
{"agent_name": "1.0.2", "llm_base": "claude-3-5-sonnet-20241022", "openness": "closed_api_available", "tool_usage": "standard", "score": 48.3, "metric": "resolve_rate", "submission_time": "2025-11-24T23:38:02.230638", "tags": ["swe-bench"], "total_cost": 34.15, "total_runtime": 541.5}
|
| 2 |
-
{"agent_name": "1.0.1", "llm_base": "gpt-4o-2024-11-20", "openness": "closed_api_available", "tool_usage": "standard", "score": 45.1, "metric": "resolve_rate", "submission_time": "2025-11-24T23:38:02.230668", "tags": ["swe-bench"], "total_cost": 32.55, "total_runtime": 525.5}
|
| 3 |
-
{"agent_name": "1.0.0", "llm_base": "gpt-4-turbo-2024-04-09", "openness": "closed_api_available", "tool_usage": "standard", "score": 38.7, "metric": "resolve_rate", "submission_time": "2025-11-24T23:38:02.230681", "tags": ["swe-bench"], "total_cost": 29.35, "total_runtime": 493.5}
|
| 4 |
-
{"agent_name": "0.9.5", "llm_base": "gpt-4o-mini-2024-07-18", "openness": "closed_api_available", "tool_usage": "standard", "score": 32.5, "metric": "resolve_rate", "submission_time": "2025-11-24T23:38:02.230689", "tags": ["swe-bench"], "total_cost": 26.25, "total_runtime": 462.5}
|
| 5 |
-
{"agent_name": "0.9.0", "llm_base": "claude-3-opus-20240229", "openness": "closed_api_available", "tool_usage": "custom_interface", "score": 29.8, "metric": "resolve_rate", "submission_time": "2025-11-24T23:38:02.230696", "tags": ["swe-bench"], "total_cost": 24.9, "total_runtime": 449.0}
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
data/extracted/swt-bench.jsonl
DELETED
|
@@ -1,5 +0,0 @@
|
|
| 1 |
-
{"agent_name": "1.0.2", "llm_base": "claude-3-5-sonnet-20241022", "openness": "closed_api_available", "tool_usage": "standard", "score": 65.4, "metric": "success_rate", "submission_time": "2025-11-24T23:38:02.231319", "tags": ["swt-bench"], "total_cost": 42.7, "total_runtime": 627.0}
|
| 2 |
-
{"agent_name": "1.0.1", "llm_base": "gpt-4o-2024-11-20", "openness": "closed_api_available", "tool_usage": "standard", "score": 62.3, "metric": "success_rate", "submission_time": "2025-11-24T23:38:02.231344", "tags": ["swt-bench"], "total_cost": 41.15, "total_runtime": 611.5}
|
| 3 |
-
{"agent_name": "1.0.0", "llm_base": "gpt-4-turbo-2024-04-09", "openness": "closed_api_available", "tool_usage": "standard", "score": 54.1, "metric": "success_rate", "submission_time": "2025-11-24T23:38:02.231356", "tags": ["swt-bench"], "total_cost": 37.05, "total_runtime": 570.5}
|
| 4 |
-
{"agent_name": "0.9.5", "llm_base": "gpt-4o-mini-2024-07-18", "openness": "closed_api_available", "tool_usage": "standard", "score": 47.8, "metric": "success_rate", "submission_time": "2025-11-24T23:38:02.231365", "tags": ["swt-bench"], "total_cost": 33.9, "total_runtime": 539.0}
|
| 5 |
-
{"agent_name": "0.9.0", "llm_base": "claude-3-opus-20240229", "openness": "closed_api_available", "tool_usage": "custom_interface", "score": 44.2, "metric": "success_rate", "submission_time": "2025-11-24T23:38:02.231373", "tags": ["swt-bench"], "total_cost": 32.1, "total_runtime": 521.0}
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
data/extracted/test.jsonl
DELETED
|
@@ -1,30 +0,0 @@
|
|
| 1 |
-
{"agent_name": "OpenHands CodeAct v2.1", "llm_base": "claude-3-5-sonnet-20241022", "openness": "closed_api_available", "tool_usage": "standard", "score": 48.3, "metric": "resolve_rate", "submission_time": "2025-11-24T19:56:00.092865", "tags": ["swe-bench"], "total_cost": 34.15, "total_runtime": 541.5}
|
| 2 |
-
{"agent_name": "OpenHands CodeAct v2.0", "llm_base": "gpt-4o-2024-11-20", "openness": "closed_api_available", "tool_usage": "standard", "score": 45.1, "metric": "resolve_rate", "submission_time": "2025-11-24T19:56:00.092895", "tags": ["swe-bench"], "total_cost": 32.55, "total_runtime": 525.5}
|
| 3 |
-
{"agent_name": "AutoCodeRover", "llm_base": "gpt-4-turbo-2024-04-09", "openness": "closed_api_available", "tool_usage": "standard", "score": 38.7, "metric": "resolve_rate", "submission_time": "2025-11-24T19:56:00.092908", "tags": ["swe-bench"], "total_cost": 29.35, "total_runtime": 493.5}
|
| 4 |
-
{"agent_name": "Agentless", "llm_base": "gpt-4o-mini-2024-07-18", "openness": "closed_api_available", "tool_usage": "standard", "score": 32.5, "metric": "resolve_rate", "submission_time": "2025-11-24T19:56:00.092916", "tags": ["swe-bench"], "total_cost": 26.25, "total_runtime": 462.5}
|
| 5 |
-
{"agent_name": "SWE-Agent", "llm_base": "claude-3-opus-20240229", "openness": "closed_api_available", "tool_usage": "custom_interface", "score": 29.8, "metric": "resolve_rate", "submission_time": "2025-11-24T19:56:00.092922", "tags": ["swe-bench"], "total_cost": 24.9, "total_runtime": 449.0}
|
| 6 |
-
{"agent_name": "OpenHands CodeAct v2.1", "llm_base": "claude-3-5-sonnet-20241022", "openness": "closed_api_available", "tool_usage": "standard", "score": 35.2, "metric": "resolve_rate", "submission_time": "2025-11-24T19:56:00.093026", "tags": ["multi-swe-bench"], "total_cost": 27.6, "total_runtime": 476.0}
|
| 7 |
-
{"agent_name": "OpenHands CodeAct v2.0", "llm_base": "gpt-4o-2024-11-20", "openness": "closed_api_available", "tool_usage": "standard", "score": 32.8, "metric": "resolve_rate", "submission_time": "2025-11-24T19:56:00.093040", "tags": ["multi-swe-bench"], "total_cost": 26.4, "total_runtime": 464.0}
|
| 8 |
-
{"agent_name": "AutoCodeRover", "llm_base": "gpt-4-turbo-2024-04-09", "openness": "closed_api_available", "tool_usage": "standard", "score": 28.4, "metric": "resolve_rate", "submission_time": "2025-11-24T19:56:00.093048", "tags": ["multi-swe-bench"], "total_cost": 24.2, "total_runtime": 442.0}
|
| 9 |
-
{"agent_name": "Agentless", "llm_base": "gpt-4o-mini-2024-07-18", "openness": "closed_api_available", "tool_usage": "standard", "score": 24.1, "metric": "resolve_rate", "submission_time": "2025-11-24T19:56:00.093058", "tags": ["multi-swe-bench"], "total_cost": 22.05, "total_runtime": 420.5}
|
| 10 |
-
{"agent_name": "SWE-Agent", "llm_base": "claude-3-opus-20240229", "openness": "closed_api_available", "tool_usage": "custom_interface", "score": 21.5, "metric": "resolve_rate", "submission_time": "2025-11-24T19:56:00.093067", "tags": ["multi-swe-bench"], "total_cost": 20.75, "total_runtime": 407.5}
|
| 11 |
-
{"agent_name": "OpenHands CodeAct v2.1", "llm_base": "claude-3-5-sonnet-20241022", "openness": "closed_api_available", "tool_usage": "standard", "score": 42.1, "metric": "resolve_rate", "submission_time": "2025-11-24T19:56:00.093172", "tags": ["swe-bench-multimodal"], "total_cost": 31.05, "total_runtime": 510.5}
|
| 12 |
-
{"agent_name": "OpenHands CodeAct v2.0", "llm_base": "gpt-4o-2024-11-20", "openness": "closed_api_available", "tool_usage": "standard", "score": 39.5, "metric": "resolve_rate", "submission_time": "2025-11-24T19:56:00.093187", "tags": ["swe-bench-multimodal"], "total_cost": 29.75, "total_runtime": 497.5}
|
| 13 |
-
{"agent_name": "AutoCodeRover", "llm_base": "gpt-4-turbo-2024-04-09", "openness": "closed_api_available", "tool_usage": "standard", "score": 34.2, "metric": "resolve_rate", "submission_time": "2025-11-24T19:56:00.093200", "tags": ["swe-bench-multimodal"], "total_cost": 27.1, "total_runtime": 471.0}
|
| 14 |
-
{"agent_name": "Agentless", "llm_base": "gpt-4o-mini-2024-07-18", "openness": "closed_api_available", "tool_usage": "standard", "score": 28.9, "metric": "resolve_rate", "submission_time": "2025-11-24T19:56:00.093208", "tags": ["swe-bench-multimodal"], "total_cost": 24.45, "total_runtime": 444.5}
|
| 15 |
-
{"agent_name": "SWE-Agent", "llm_base": "claude-3-opus-20240229", "openness": "closed_api_available", "tool_usage": "custom_interface", "score": 25.7, "metric": "resolve_rate", "submission_time": "2025-11-24T19:56:00.093218", "tags": ["swe-bench-multimodal"], "total_cost": 22.85, "total_runtime": 428.5}
|
| 16 |
-
{"agent_name": "OpenHands CodeAct v2.1", "llm_base": "claude-3-5-sonnet-20241022", "openness": "closed_api_available", "tool_usage": "standard", "score": 65.4, "metric": "success_rate", "submission_time": "2025-11-24T19:56:00.093304", "tags": ["swt-bench"], "total_cost": 42.7, "total_runtime": 627.0}
|
| 17 |
-
{"agent_name": "OpenHands CodeAct v2.0", "llm_base": "gpt-4o-2024-11-20", "openness": "closed_api_available", "tool_usage": "standard", "score": 62.3, "metric": "success_rate", "submission_time": "2025-11-24T19:56:00.093317", "tags": ["swt-bench"], "total_cost": 41.15, "total_runtime": 611.5}
|
| 18 |
-
{"agent_name": "AutoCodeRover", "llm_base": "gpt-4-turbo-2024-04-09", "openness": "closed_api_available", "tool_usage": "standard", "score": 54.1, "metric": "success_rate", "submission_time": "2025-11-24T19:56:00.093326", "tags": ["swt-bench"], "total_cost": 37.05, "total_runtime": 570.5}
|
| 19 |
-
{"agent_name": "Agentless", "llm_base": "gpt-4o-mini-2024-07-18", "openness": "closed_api_available", "tool_usage": "standard", "score": 47.8, "metric": "success_rate", "submission_time": "2025-11-24T19:56:00.093333", "tags": ["swt-bench"], "total_cost": 33.9, "total_runtime": 539.0}
|
| 20 |
-
{"agent_name": "SWE-Agent", "llm_base": "claude-3-opus-20240229", "openness": "closed_api_available", "tool_usage": "custom_interface", "score": 44.2, "metric": "success_rate", "submission_time": "2025-11-24T19:56:00.093338", "tags": ["swt-bench"], "total_cost": 32.1, "total_runtime": 521.0}
|
| 21 |
-
{"agent_name": "OpenHands CodeAct v2.1", "llm_base": "claude-3-5-sonnet-20241022", "openness": "closed_api_available", "tool_usage": "standard", "score": 71.2, "metric": "test_pass_rate", "submission_time": "2025-11-24T19:56:00.093416", "tags": ["commit0"], "total_cost": 45.6, "total_runtime": 656.0}
|
| 22 |
-
{"agent_name": "OpenHands CodeAct v2.0", "llm_base": "gpt-4o-2024-11-20", "openness": "closed_api_available", "tool_usage": "standard", "score": 68.9, "metric": "test_pass_rate", "submission_time": "2025-11-24T19:56:00.093428", "tags": ["commit0"], "total_cost": 44.45, "total_runtime": 644.5}
|
| 23 |
-
{"agent_name": "AutoCodeRover", "llm_base": "gpt-4-turbo-2024-04-09", "openness": "closed_api_available", "tool_usage": "standard", "score": 61.5, "metric": "test_pass_rate", "submission_time": "2025-11-24T19:56:00.093436", "tags": ["commit0"], "total_cost": 40.75, "total_runtime": 607.5}
|
| 24 |
-
{"agent_name": "Agentless", "llm_base": "gpt-4o-mini-2024-07-18", "openness": "closed_api_available", "tool_usage": "standard", "score": 55.3, "metric": "test_pass_rate", "submission_time": "2025-11-24T19:56:00.093445", "tags": ["commit0"], "total_cost": 37.65, "total_runtime": 576.5}
|
| 25 |
-
{"agent_name": "SWE-Agent", "llm_base": "claude-3-opus-20240229", "openness": "closed_api_available", "tool_usage": "custom_interface", "score": 52.1, "metric": "test_pass_rate", "submission_time": "2025-11-24T19:56:00.093450", "tags": ["commit0"], "total_cost": 36.05, "total_runtime": 560.5}
|
| 26 |
-
{"agent_name": "OpenHands CodeAct v2.1", "llm_base": "claude-3-5-sonnet-20241022", "openness": "closed_api_available", "tool_usage": "standard", "score": 58.7, "metric": "accuracy", "submission_time": "2025-11-24T19:56:00.093541", "tags": ["gaia"], "total_cost": 39.35, "total_runtime": 593.5}
|
| 27 |
-
{"agent_name": "OpenHands CodeAct v2.0", "llm_base": "gpt-4o-2024-11-20", "openness": "closed_api_available", "tool_usage": "standard", "score": 55.2, "metric": "accuracy", "submission_time": "2025-11-24T19:56:00.093556", "tags": ["gaia"], "total_cost": 37.6, "total_runtime": 576.0}
|
| 28 |
-
{"agent_name": "AutoCodeRover", "llm_base": "gpt-4-turbo-2024-04-09", "openness": "closed_api_available", "tool_usage": "standard", "score": 48.3, "metric": "accuracy", "submission_time": "2025-11-24T19:56:00.093563", "tags": ["gaia"], "total_cost": 34.15, "total_runtime": 541.5}
|
| 29 |
-
{"agent_name": "Agentless", "llm_base": "gpt-4o-mini-2024-07-18", "openness": "closed_api_available", "tool_usage": "standard", "score": 42.1, "metric": "accuracy", "submission_time": "2025-11-24T19:56:00.093568", "tags": ["gaia"], "total_cost": 31.05, "total_runtime": 510.5}
|
| 30 |
-
{"agent_name": "SWE-Agent", "llm_base": "claude-3-opus-20240229", "openness": "closed_api_available", "tool_usage": "custom_interface", "score": 39.4, "metric": "accuracy", "submission_time": "2025-11-24T19:56:00.093574", "tags": ["gaia"], "total_cost": 29.7, "total_runtime": 497.0}
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
data/extracted/test.parquet
DELETED
|
Binary file (9.26 kB)
|
|
|
data/extracted/validation.jsonl
DELETED
|
@@ -1,30 +0,0 @@
|
|
| 1 |
-
{"agent_name": "OpenHands CodeAct v2.1", "llm_base": "claude-3-5-sonnet-20241022", "openness": "closed_api_available", "tool_usage": "standard", "score": 48.3, "metric": "resolve_rate", "submission_time": "2025-11-24T19:56:00.092865", "tags": ["swe-bench"], "total_cost": 34.15, "total_runtime": 541.5}
|
| 2 |
-
{"agent_name": "OpenHands CodeAct v2.0", "llm_base": "gpt-4o-2024-11-20", "openness": "closed_api_available", "tool_usage": "standard", "score": 45.1, "metric": "resolve_rate", "submission_time": "2025-11-24T19:56:00.092895", "tags": ["swe-bench"], "total_cost": 32.55, "total_runtime": 525.5}
|
| 3 |
-
{"agent_name": "AutoCodeRover", "llm_base": "gpt-4-turbo-2024-04-09", "openness": "closed_api_available", "tool_usage": "standard", "score": 38.7, "metric": "resolve_rate", "submission_time": "2025-11-24T19:56:00.092908", "tags": ["swe-bench"], "total_cost": 29.35, "total_runtime": 493.5}
|
| 4 |
-
{"agent_name": "Agentless", "llm_base": "gpt-4o-mini-2024-07-18", "openness": "closed_api_available", "tool_usage": "standard", "score": 32.5, "metric": "resolve_rate", "submission_time": "2025-11-24T19:56:00.092916", "tags": ["swe-bench"], "total_cost": 26.25, "total_runtime": 462.5}
|
| 5 |
-
{"agent_name": "SWE-Agent", "llm_base": "claude-3-opus-20240229", "openness": "closed_api_available", "tool_usage": "custom_interface", "score": 29.8, "metric": "resolve_rate", "submission_time": "2025-11-24T19:56:00.092922", "tags": ["swe-bench"], "total_cost": 24.9, "total_runtime": 449.0}
|
| 6 |
-
{"agent_name": "OpenHands CodeAct v2.1", "llm_base": "claude-3-5-sonnet-20241022", "openness": "closed_api_available", "tool_usage": "standard", "score": 35.2, "metric": "resolve_rate", "submission_time": "2025-11-24T19:56:00.093026", "tags": ["multi-swe-bench"], "total_cost": 27.6, "total_runtime": 476.0}
|
| 7 |
-
{"agent_name": "OpenHands CodeAct v2.0", "llm_base": "gpt-4o-2024-11-20", "openness": "closed_api_available", "tool_usage": "standard", "score": 32.8, "metric": "resolve_rate", "submission_time": "2025-11-24T19:56:00.093040", "tags": ["multi-swe-bench"], "total_cost": 26.4, "total_runtime": 464.0}
|
| 8 |
-
{"agent_name": "AutoCodeRover", "llm_base": "gpt-4-turbo-2024-04-09", "openness": "closed_api_available", "tool_usage": "standard", "score": 28.4, "metric": "resolve_rate", "submission_time": "2025-11-24T19:56:00.093048", "tags": ["multi-swe-bench"], "total_cost": 24.2, "total_runtime": 442.0}
|
| 9 |
-
{"agent_name": "Agentless", "llm_base": "gpt-4o-mini-2024-07-18", "openness": "closed_api_available", "tool_usage": "standard", "score": 24.1, "metric": "resolve_rate", "submission_time": "2025-11-24T19:56:00.093058", "tags": ["multi-swe-bench"], "total_cost": 22.05, "total_runtime": 420.5}
|
| 10 |
-
{"agent_name": "SWE-Agent", "llm_base": "claude-3-opus-20240229", "openness": "closed_api_available", "tool_usage": "custom_interface", "score": 21.5, "metric": "resolve_rate", "submission_time": "2025-11-24T19:56:00.093067", "tags": ["multi-swe-bench"], "total_cost": 20.75, "total_runtime": 407.5}
|
| 11 |
-
{"agent_name": "OpenHands CodeAct v2.1", "llm_base": "claude-3-5-sonnet-20241022", "openness": "closed_api_available", "tool_usage": "standard", "score": 42.1, "metric": "resolve_rate", "submission_time": "2025-11-24T19:56:00.093172", "tags": ["swe-bench-multimodal"], "total_cost": 31.05, "total_runtime": 510.5}
|
| 12 |
-
{"agent_name": "OpenHands CodeAct v2.0", "llm_base": "gpt-4o-2024-11-20", "openness": "closed_api_available", "tool_usage": "standard", "score": 39.5, "metric": "resolve_rate", "submission_time": "2025-11-24T19:56:00.093187", "tags": ["swe-bench-multimodal"], "total_cost": 29.75, "total_runtime": 497.5}
|
| 13 |
-
{"agent_name": "AutoCodeRover", "llm_base": "gpt-4-turbo-2024-04-09", "openness": "closed_api_available", "tool_usage": "standard", "score": 34.2, "metric": "resolve_rate", "submission_time": "2025-11-24T19:56:00.093200", "tags": ["swe-bench-multimodal"], "total_cost": 27.1, "total_runtime": 471.0}
|
| 14 |
-
{"agent_name": "Agentless", "llm_base": "gpt-4o-mini-2024-07-18", "openness": "closed_api_available", "tool_usage": "standard", "score": 28.9, "metric": "resolve_rate", "submission_time": "2025-11-24T19:56:00.093208", "tags": ["swe-bench-multimodal"], "total_cost": 24.45, "total_runtime": 444.5}
|
| 15 |
-
{"agent_name": "SWE-Agent", "llm_base": "claude-3-opus-20240229", "openness": "closed_api_available", "tool_usage": "custom_interface", "score": 25.7, "metric": "resolve_rate", "submission_time": "2025-11-24T19:56:00.093218", "tags": ["swe-bench-multimodal"], "total_cost": 22.85, "total_runtime": 428.5}
|
| 16 |
-
{"agent_name": "OpenHands CodeAct v2.1", "llm_base": "claude-3-5-sonnet-20241022", "openness": "closed_api_available", "tool_usage": "standard", "score": 65.4, "metric": "success_rate", "submission_time": "2025-11-24T19:56:00.093304", "tags": ["swt-bench"], "total_cost": 42.7, "total_runtime": 627.0}
|
| 17 |
-
{"agent_name": "OpenHands CodeAct v2.0", "llm_base": "gpt-4o-2024-11-20", "openness": "closed_api_available", "tool_usage": "standard", "score": 62.3, "metric": "success_rate", "submission_time": "2025-11-24T19:56:00.093317", "tags": ["swt-bench"], "total_cost": 41.15, "total_runtime": 611.5}
|
| 18 |
-
{"agent_name": "AutoCodeRover", "llm_base": "gpt-4-turbo-2024-04-09", "openness": "closed_api_available", "tool_usage": "standard", "score": 54.1, "metric": "success_rate", "submission_time": "2025-11-24T19:56:00.093326", "tags": ["swt-bench"], "total_cost": 37.05, "total_runtime": 570.5}
|
| 19 |
-
{"agent_name": "Agentless", "llm_base": "gpt-4o-mini-2024-07-18", "openness": "closed_api_available", "tool_usage": "standard", "score": 47.8, "metric": "success_rate", "submission_time": "2025-11-24T19:56:00.093333", "tags": ["swt-bench"], "total_cost": 33.9, "total_runtime": 539.0}
|
| 20 |
-
{"agent_name": "SWE-Agent", "llm_base": "claude-3-opus-20240229", "openness": "closed_api_available", "tool_usage": "custom_interface", "score": 44.2, "metric": "success_rate", "submission_time": "2025-11-24T19:56:00.093338", "tags": ["swt-bench"], "total_cost": 32.1, "total_runtime": 521.0}
|
| 21 |
-
{"agent_name": "OpenHands CodeAct v2.1", "llm_base": "claude-3-5-sonnet-20241022", "openness": "closed_api_available", "tool_usage": "standard", "score": 71.2, "metric": "test_pass_rate", "submission_time": "2025-11-24T19:56:00.093416", "tags": ["commit0"], "total_cost": 45.6, "total_runtime": 656.0}
|
| 22 |
-
{"agent_name": "OpenHands CodeAct v2.0", "llm_base": "gpt-4o-2024-11-20", "openness": "closed_api_available", "tool_usage": "standard", "score": 68.9, "metric": "test_pass_rate", "submission_time": "2025-11-24T19:56:00.093428", "tags": ["commit0"], "total_cost": 44.45, "total_runtime": 644.5}
|
| 23 |
-
{"agent_name": "AutoCodeRover", "llm_base": "gpt-4-turbo-2024-04-09", "openness": "closed_api_available", "tool_usage": "standard", "score": 61.5, "metric": "test_pass_rate", "submission_time": "2025-11-24T19:56:00.093436", "tags": ["commit0"], "total_cost": 40.75, "total_runtime": 607.5}
|
| 24 |
-
{"agent_name": "Agentless", "llm_base": "gpt-4o-mini-2024-07-18", "openness": "closed_api_available", "tool_usage": "standard", "score": 55.3, "metric": "test_pass_rate", "submission_time": "2025-11-24T19:56:00.093445", "tags": ["commit0"], "total_cost": 37.65, "total_runtime": 576.5}
|
| 25 |
-
{"agent_name": "SWE-Agent", "llm_base": "claude-3-opus-20240229", "openness": "closed_api_available", "tool_usage": "custom_interface", "score": 52.1, "metric": "test_pass_rate", "submission_time": "2025-11-24T19:56:00.093450", "tags": ["commit0"], "total_cost": 36.05, "total_runtime": 560.5}
|
| 26 |
-
{"agent_name": "OpenHands CodeAct v2.1", "llm_base": "claude-3-5-sonnet-20241022", "openness": "closed_api_available", "tool_usage": "standard", "score": 58.7, "metric": "accuracy", "submission_time": "2025-11-24T19:56:00.093541", "tags": ["gaia"], "total_cost": 39.35, "total_runtime": 593.5}
|
| 27 |
-
{"agent_name": "OpenHands CodeAct v2.0", "llm_base": "gpt-4o-2024-11-20", "openness": "closed_api_available", "tool_usage": "standard", "score": 55.2, "metric": "accuracy", "submission_time": "2025-11-24T19:56:00.093556", "tags": ["gaia"], "total_cost": 37.6, "total_runtime": 576.0}
|
| 28 |
-
{"agent_name": "AutoCodeRover", "llm_base": "gpt-4-turbo-2024-04-09", "openness": "closed_api_available", "tool_usage": "standard", "score": 48.3, "metric": "accuracy", "submission_time": "2025-11-24T19:56:00.093563", "tags": ["gaia"], "total_cost": 34.15, "total_runtime": 541.5}
|
| 29 |
-
{"agent_name": "Agentless", "llm_base": "gpt-4o-mini-2024-07-18", "openness": "closed_api_available", "tool_usage": "standard", "score": 42.1, "metric": "accuracy", "submission_time": "2025-11-24T19:56:00.093568", "tags": ["gaia"], "total_cost": 31.05, "total_runtime": 510.5}
|
| 30 |
-
{"agent_name": "SWE-Agent", "llm_base": "claude-3-opus-20240229", "openness": "closed_api_available", "tool_usage": "custom_interface", "score": 39.4, "metric": "accuracy", "submission_time": "2025-11-24T19:56:00.093574", "tags": ["gaia"], "total_cost": 29.7, "total_runtime": 497.0}
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
data/extracted/validation.parquet
DELETED
|
Binary file (9.29 kB)
|
|
|
leaderboard_transformer.py
CHANGED
|
@@ -279,7 +279,7 @@ class DataTransformer:
|
|
| 279 |
if primary_metric == "Overall":
|
| 280 |
def calculate_attempted(row):
|
| 281 |
main_categories = ['Bug Fixing', 'Frontend Development', 'App Creation', 'Test Generation', 'Information Gathering']
|
| 282 |
-
count = sum(1 for category in main_categories if row.get(f"{category} Score")
|
| 283 |
return f"{count}/5"
|
| 284 |
|
| 285 |
# Apply the function row-wise to create the new column
|
|
|
|
| 279 |
if primary_metric == "Overall":
|
| 280 |
def calculate_attempted(row):
|
| 281 |
main_categories = ['Bug Fixing', 'Frontend Development', 'App Creation', 'Test Generation', 'Information Gathering']
|
| 282 |
+
count = sum(1 for category in main_categories if row.get(f"{category} Score") not in [None, 0.0])
|
| 283 |
return f"{count}/5"
|
| 284 |
|
| 285 |
# Apply the function row-wise to create the new column
|