Spaces: openhands / openhands

openhands committed · Commit 5e9c3b9 · 1 Parent(s): c56f232

Fix category cost calculation - add category-level aggregation

- Updated simple_data_loader.py to build a proper category-to-benchmarks mapping from agenteval.json
- Added logic to calculate category-level aggregate scores and costs from the individual benchmark results
- Updated agenteval.json to include category tags (Bug Fixing, App Creation, etc.)
- Fixes an issue where the Bug Fixing cost showed as 0.0 instead of the average of the swe-bench and swe-bench-multimodal costs (see the sketch after the file list below)

Co-authored-by: openhands <[email protected]>

Files changed:
- data/1.0.0-dev1/agenteval.json  +71 -47
- simple_data_loader.py  +33 -6
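
As a quick illustration of the fix (a minimal sketch with made-up numbers, not results from any run): a category's cost is the mean of its member benchmarks' costs, so the Bug Fixing column should come out as the average of the swe-bench and swe-bench-multimodal costs rather than 0.0.

```python
# Minimal sketch of the intended category-cost behaviour.
# The costs below are hypothetical placeholders, not real leaderboard numbers.
bug_fixing_benchmark_costs = {
    "swe-bench": 1.20,             # hypothetical
    "swe-bench-multimodal": 0.80,  # hypothetical
}

# Before the fix the "Bug Fixing cost" column stayed at 0.0; after the fix it is
# the average of the member benchmarks' costs.
bug_fixing_cost = sum(bug_fixing_benchmark_costs.values()) / len(bug_fixing_benchmark_costs)
print(bug_fixing_cost)  # 1.0
```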
data/1.0.0-dev1/agenteval.json  CHANGED

```diff
@@ -4,87 +4,111 @@
     "version": "1.0.0-dev1",
     "splits": [
       {
+        "name": "test",
         "tasks": [
           {
             "name": "swe-bench",
+            "tags": [
+              "Overall",
+              "Bug Fixing",
+              "swe-bench"
+            ]
           },
           {
+            "name": "swe-bench-multimodal",
+            "tags": [
+              "Overall",
+              "Bug Fixing",
+              "swe-bench-multimodal"
+            ]
           },
           {
+            "name": "commit0",
+            "tags": [
+              "Overall",
+              "App Creation",
+              "commit0"
+            ]
           },
           {
+            "name": "multi-swe-bench",
+            "tags": [
+              "Overall",
+              "Frontend Development",
+              "multi-swe-bench"
+            ]
           },
           {
+            "name": "swt-bench",
+            "tags": [
+              "Overall",
+              "Test Generation",
+              "swt-bench"
+            ]
           },
           {
             "name": "gaia",
+            "tags": [
+              "Overall",
+              "Information Gathering",
+              "gaia"
+            ]
           }
         ]
       },
       {
+        "name": "validation",
         "tasks": [
           {
             "name": "swe-bench",
+            "tags": [
+              "Overall",
+              "Bug Fixing",
+              "swe-bench"
+            ]
           },
           {
+            "name": "swe-bench-multimodal",
+            "tags": [
+              "Overall",
+              "Bug Fixing",
+              "swe-bench-multimodal"
+            ]
           },
           {
+            "name": "commit0",
+            "tags": [
+              "Overall",
+              "App Creation",
+              "commit0"
+            ]
           },
           {
+            "name": "multi-swe-bench",
+            "tags": [
+              "Overall",
+              "Frontend Development",
+              "multi-swe-bench"
+            ]
           },
           {
+            "name": "swt-bench",
+            "tags": [
+              "Overall",
+              "Test Generation",
+              "swt-bench"
+            ]
           },
           {
             "name": "gaia",
+            "tags": [
+              "Overall",
+              "Information Gathering",
+              "gaia"
+            ]
           }
         ]
       }
     ]
   }
-}
+}
```
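
The tags above drive the category grouping. As a standalone reference (not part of the commit), the sketch below applies the same rule the updated simple_data_loader.py uses: a tag counts as a category when it is neither "Overall" nor the benchmark's own name. The tag lists are copied from the new "test" split.

```python
# Derive the category-to-benchmarks map from the tags in the "test" split above.
# This mirrors the mapping logic added to simple_data_loader.py; it is an
# illustrative sketch, not code from the repository.
tasks = {
    "swe-bench": ["Overall", "Bug Fixing", "swe-bench"],
    "swe-bench-multimodal": ["Overall", "Bug Fixing", "swe-bench-multimodal"],
    "commit0": ["Overall", "App Creation", "commit0"],
    "multi-swe-bench": ["Overall", "Frontend Development", "multi-swe-bench"],
    "swt-bench": ["Overall", "Test Generation", "swt-bench"],
    "gaia": ["Overall", "Information Gathering", "gaia"],
}

tag_map = {}
for task_name, tags in tasks.items():
    for tag in tags:
        # A tag is a category if it is neither "Overall" nor the benchmark's own name.
        if tag not in ("Overall", task_name):
            tag_map.setdefault(tag, []).append(task_name)

print(tag_map)
# {'Bug Fixing': ['swe-bench', 'swe-bench-multimodal'],
#  'App Creation': ['commit0'],
#  'Frontend Development': ['multi-swe-bench'],
#  'Test Generation': ['swt-bench'],
#  'Information Gathering': ['gaia']}
```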
simple_data_loader.py  CHANGED

```diff
@@ -36,15 +36,24 @@ class SimpleLeaderboardViewer:
             "splits": []
         }
 
-        # Build tag map from config
+        # Build tag map from config - organize benchmarks by category
         self.tag_map = {}
+        self.benchmark_to_categories = {}  # Maps benchmark name to its categories
         for split_config in self.suite_config.get("splits", []):
             if split_config["name"] == split:
                 for task in split_config.get("tasks", []):
+                    task_name = task["name"]
+                    # Store which categories this benchmark belongs to
+                    self.benchmark_to_categories[task_name] = []
                     for tag in task.get("tags", []):
+                        # Skip "Overall" and the benchmark's own name
+                        if tag != "Overall" and tag != task_name:
+                            # This is a category tag
+                            if tag not in self.tag_map:
+                                self.tag_map[tag] = []
+                            if task_name not in self.tag_map[tag]:
+                                self.tag_map[tag].append(task_name)
+                            self.benchmark_to_categories[task_name].append(tag)
 
     def _load(self):
         """Load the JSONL file for the split and return DataFrame and tag map."""
@@ -99,16 +108,34 @@ class SimpleLeaderboardViewer:
             dataset_scores = []
             dataset_costs = []
 
+            # Track category-level data for aggregation
+            category_data = {}  # {category: {'scores': [...], 'costs': [...]}}
+
             for _, row in agent_records.iterrows():
                 tags = row['tags'] if isinstance(row['tags'], list) else [row['tags']]
                 for tag in tags:
-                    # Add columns for this specific dataset
+                    # Add columns for this specific dataset/benchmark
                     record[f'{tag} score'] = row['score']
                     record[f'{tag} cost'] = row['total_cost']
                     dataset_scores.append(row['score'])
                     dataset_costs.append(row['total_cost'])
+
+                    # Track category-level data for aggregation
+                    if tag in self.benchmark_to_categories:
+                        for category in self.benchmark_to_categories[tag]:
+                            if category not in category_data:
+                                category_data[category] = {'scores': [], 'costs': []}
+                            category_data[category]['scores'].append(row['score'])
+                            category_data[category]['costs'].append(row['total_cost'])
+
+            # Calculate category-level aggregates
+            for category, data in category_data.items():
+                if data['scores']:
+                    record[f'{category} score'] = sum(data['scores']) / len(data['scores'])
+                if data['costs']:
+                    record[f'{category} cost'] = sum(data['costs']) / len(data['costs'])
 
-            # Calculate overall score and cost (average across
+            # Calculate overall score and cost (average across all benchmarks)
             if dataset_scores:
                 record['overall score'] = sum(dataset_scores) / len(dataset_scores)
                 record['overall cost'] = sum(dataset_costs) / len(dataset_costs)
```
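
Tying the two changes together, here is a hedged end-to-end sketch of the aggregation the new code performs for a single agent's record. The per-benchmark scores and costs are invented for illustration; the category mapping follows the tags added to agenteval.json above, so "Bug Fixing" now averages its two member benchmarks while "overall" still averages across all six.

```python
# End-to-end sketch of the new aggregation (all scores/costs are hypothetical).
results = {  # benchmark -> (score, total_cost)
    "swe-bench": (0.40, 2.10),
    "swe-bench-multimodal": (0.30, 0.30),
    "commit0": (0.25, 2.00),
    "multi-swe-bench": (0.35, 1.50),
    "swt-bench": (0.45, 0.90),
    "gaia": (0.50, 0.60),
}
benchmark_to_categories = {  # follows the tags in the updated agenteval.json
    "swe-bench": ["Bug Fixing"],
    "swe-bench-multimodal": ["Bug Fixing"],
    "commit0": ["App Creation"],
    "multi-swe-bench": ["Frontend Development"],
    "swt-bench": ["Test Generation"],
    "gaia": ["Information Gathering"],
}

record = {}
category_data = {}  # {category: {'scores': [...], 'costs': [...]}}
dataset_scores, dataset_costs = [], []

for name, (score, cost) in results.items():
    # Per-benchmark columns, as before the change
    record[f"{name} score"], record[f"{name} cost"] = score, cost
    dataset_scores.append(score)
    dataset_costs.append(cost)
    # New: collect this benchmark's numbers under each of its categories
    for category in benchmark_to_categories.get(name, []):
        data = category_data.setdefault(category, {"scores": [], "costs": []})
        data["scores"].append(score)
        data["costs"].append(cost)

# New: category-level averages
for category, data in category_data.items():
    record[f"{category} score"] = sum(data["scores"]) / len(data["scores"])
    record[f"{category} cost"] = sum(data["costs"]) / len(data["costs"])

# Overall averages still span all benchmarks
record["overall score"] = sum(dataset_scores) / len(dataset_scores)
record["overall cost"] = sum(dataset_costs) / len(dataset_costs)

print(round(record["Bug Fixing cost"], 3))  # 1.2   (mean of 2.10 and 0.30, no longer 0.0)
print(round(record["overall cost"], 3))     # 1.233 (mean over all six benchmarks)
```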