openhands committed
Commit 5e9c3b9 · parent c56f232

Fix category cost calculation - add category-level aggregation


- Updated simple_data_loader.py to build a proper category-to-benchmarks mapping from agenteval.json
- Added logic to calculate category-level aggregate scores and costs from the individual benchmark results
- Updated agenteval.json to tag each benchmark with its category (Bug Fixing, App Creation, etc.)
- Fixes the issue where the Bug Fixing cost showed as 0.0 instead of the average of the swe-bench and swe-bench-multimodal costs

Co-authored-by: openhands <[email protected]>
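In other words, a category's cost is the plain mean over its member benchmarks' costs. A minimal sketch of the intended arithmetic, using hypothetical cost values for the two Bug Fixing benchmarks (the real numbers come from the per-benchmark results that simple_data_loader.py loads):

# Hypothetical numbers, for illustration only.
benchmark_costs = {"swe-bench": 1.20, "swe-bench-multimodal": 0.80}
bug_fixing = ["swe-bench", "swe-bench-multimodal"]  # members of the category

bug_fixing_cost = sum(benchmark_costs[b] for b in bug_fixing) / len(bug_fixing)
print(bug_fixing_cost)  # 1.0, where the leaderboard previously showed 0.0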

data/1.0.0-dev1/agenteval.json CHANGED
@@ -4,87 +4,111 @@
   "version": "1.0.0-dev1",
   "splits": [
     {
-      "name": "validation",
+      "name": "test",
       "tasks": [
         {
           "name": "swe-bench",
-          "path": "openhands/swe-bench",
-          "primary_metric": "resolved/mean",
-          "tags": ["swe-bench"]
+          "tags": [
+            "Overall",
+            "Bug Fixing",
+            "swe-bench"
+          ]
         },
         {
-          "name": "multi-swe-bench",
-          "path": "openhands/multi-swe-bench",
-          "primary_metric": "resolved/mean",
-          "tags": ["multi-swe-bench"]
+          "name": "swe-bench-multimodal",
+          "tags": [
+            "Overall",
+            "Bug Fixing",
+            "swe-bench-multimodal"
+          ]
         },
         {
-          "name": "swe-bench-multimodal",
-          "path": "openhands/swe-bench-multimodal",
-          "primary_metric": "resolved/mean",
-          "tags": ["swe-bench-multimodal"]
+          "name": "commit0",
+          "tags": [
+            "Overall",
+            "App Creation",
+            "commit0"
+          ]
         },
         {
-          "name": "swt-bench",
-          "path": "openhands/swt-bench",
-          "primary_metric": "generated/mean",
-          "tags": ["swt-bench"]
+          "name": "multi-swe-bench",
+          "tags": [
+            "Overall",
+            "Frontend Development",
+            "multi-swe-bench"
+          ]
         },
         {
-          "name": "commit0",
-          "path": "openhands/commit0",
-          "primary_metric": "tests_passed/mean",
-          "tags": ["commit0"]
+          "name": "swt-bench",
+          "tags": [
+            "Overall",
+            "Test Generation",
+            "swt-bench"
+          ]
         },
         {
           "name": "gaia",
-          "path": "openhands/gaia",
-          "primary_metric": "correct/mean",
-          "tags": ["gaia"]
+          "tags": [
+            "Overall",
+            "Information Gathering",
+            "gaia"
+          ]
         }
       ]
     },
     {
-      "name": "test",
+      "name": "validation",
       "tasks": [
         {
           "name": "swe-bench",
-          "path": "openhands/swe-bench",
-          "primary_metric": "resolved/mean",
-          "tags": ["swe-bench"]
+          "tags": [
+            "Overall",
+            "Bug Fixing",
+            "swe-bench"
+          ]
         },
         {
-          "name": "multi-swe-bench",
-          "path": "openhands/multi-swe-bench",
-          "primary_metric": "resolved/mean",
-          "tags": ["multi-swe-bench"]
+          "name": "swe-bench-multimodal",
+          "tags": [
+            "Overall",
+            "Bug Fixing",
+            "swe-bench-multimodal"
+          ]
         },
         {
-          "name": "swe-bench-multimodal",
-          "path": "openhands/swe-bench-multimodal",
-          "primary_metric": "resolved/mean",
-          "tags": ["swe-bench-multimodal"]
+          "name": "commit0",
+          "tags": [
+            "Overall",
+            "App Creation",
+            "commit0"
+          ]
         },
         {
-          "name": "swt-bench",
-          "path": "openhands/swt-bench",
-          "primary_metric": "generated/mean",
-          "tags": ["swt-bench"]
+          "name": "multi-swe-bench",
+          "tags": [
+            "Overall",
+            "Frontend Development",
+            "multi-swe-bench"
+          ]
         },
         {
-          "name": "commit0",
-          "path": "openhands/commit0",
-          "primary_metric": "tests_passed/mean",
-          "tags": ["commit0"]
+          "name": "swt-bench",
+          "tags": [
+            "Overall",
+            "Test Generation",
+            "swt-bench"
+          ]
         },
         {
           "name": "gaia",
-          "path": "openhands/gaia",
-          "primary_metric": "correct/mean",
-          "tags": ["gaia"]
+          "tags": [
+            "Overall",
+            "Information Gathering",
+            "gaia"
+          ]
         }
       ]
     }
   ]
 }
-}
+}
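Under the new schema each task carries three kinds of tags: "Overall", a category label, and the benchmark's own name; the loader treats any tag that is neither "Overall" nor the task name as a category. A minimal standalone sketch of the mapping this yields (mirroring the loader's logic; assumes the file is read from the path in this commit, "test" split):

import json

with open("data/1.0.0-dev1/agenteval.json") as f:
    suite = json.load(f)

tag_map = {}
for split in suite["splits"]:
    if split["name"] != "test":
        continue
    for task in split["tasks"]:
        for tag in task["tags"]:
            # Category = any tag that is neither "Overall" nor the task's own name
            if tag not in ("Overall", task["name"]):
                tag_map.setdefault(tag, [])
                if task["name"] not in tag_map[tag]:
                    tag_map[tag].append(task["name"])

print(tag_map["Bug Fixing"])  # ['swe-bench', 'swe-bench-multimodal']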
simple_data_loader.py CHANGED
@@ -36,15 +36,24 @@ class SimpleLeaderboardViewer:
                 "splits": []
             }
 
-        # Build tag map from config
+        # Build tag map from config - organize benchmarks by category
         self.tag_map = {}
+        self.benchmark_to_categories = {}  # Maps benchmark name to its categories
         for split_config in self.suite_config.get("splits", []):
             if split_config["name"] == split:
                 for task in split_config.get("tasks", []):
+                    task_name = task["name"]
+                    # Store which categories this benchmark belongs to
+                    self.benchmark_to_categories[task_name] = []
                     for tag in task.get("tags", []):
-                        if tag not in self.tag_map:
-                            self.tag_map[tag] = []
-                        self.tag_map[tag].append(task["name"])
+                        # Skip "Overall" and the benchmark's own name
+                        if tag != "Overall" and tag != task_name:
+                            # This is a category tag
+                            if tag not in self.tag_map:
+                                self.tag_map[tag] = []
+                            if task_name not in self.tag_map[tag]:
+                                self.tag_map[tag].append(task_name)
+                            self.benchmark_to_categories[task_name].append(tag)
 
     def _load(self):
         """Load the JSONL file for the split and return DataFrame and tag map."""
@@ -99,16 +108,34 @@
         dataset_scores = []
         dataset_costs = []
 
+        # Track category-level data for aggregation
+        category_data = {}  # {category: {'scores': [...], 'costs': [...]}}
+
         for _, row in agent_records.iterrows():
             tags = row['tags'] if isinstance(row['tags'], list) else [row['tags']]
             for tag in tags:
-                # Add columns for this specific dataset
+                # Add columns for this specific dataset/benchmark
                 record[f'{tag} score'] = row['score']
                 record[f'{tag} cost'] = row['total_cost']
                 dataset_scores.append(row['score'])
                 dataset_costs.append(row['total_cost'])
+
+                # Track category-level data for aggregation
+                if tag in self.benchmark_to_categories:
+                    for category in self.benchmark_to_categories[tag]:
+                        if category not in category_data:
+                            category_data[category] = {'scores': [], 'costs': []}
+                        category_data[category]['scores'].append(row['score'])
+                        category_data[category]['costs'].append(row['total_cost'])
+
+        # Calculate category-level aggregates
+        for category, data in category_data.items():
+            if data['scores']:
+                record[f'{category} score'] = sum(data['scores']) / len(data['scores'])
+            if data['costs']:
+                record[f'{category} cost'] = sum(data['costs']) / len(data['costs'])
 
-        # Calculate overall score and cost (average across datasets)
+        # Calculate overall score and cost (average across all benchmarks)
         if dataset_scores:
             record['overall score'] = sum(dataset_scores) / len(dataset_scores)
             record['overall cost'] = sum(dataset_costs) / len(dataset_costs)
  record['overall cost'] = sum(dataset_costs) / len(dataset_costs)