openhands committed
Commit 5e9c3b9 · parent c56f232

Fix category cost calculation - add category-level aggregation


- Updated simple_data_loader.py to build a proper category-to-benchmarks mapping from agenteval.json
- Added logic to calculate category-level aggregate scores and costs from the individual benchmark results
- Updated agenteval.json to tag each benchmark with its category (Bug Fixing, App Creation, etc.)
- Fixes the issue where the Bug Fixing cost showed as 0.0 instead of the average of the swe-bench and swe-bench-multimodal costs

Co-authored-by: openhands <[email protected]>
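In other words, a category's cost is the plain mean over its member benchmarks' costs. A minimal sketch of the intended arithmetic, using hypothetical cost values for the two Bug Fixing benchmarks (the real numbers come from the per-benchmark results that simple_data_loader.py loads):

# Hypothetical numbers, for illustration only.
benchmark_costs = {"swe-bench": 1.20, "swe-bench-multimodal": 0.80}
bug_fixing = ["swe-bench", "swe-bench-multimodal"]  # members of the category

bug_fixing_cost = sum(benchmark_costs[b] for b in bug_fixing) / len(bug_fixing)
print(bug_fixing_cost)  # 1.0, where the leaderboard previously showed 0.0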

data/1.0.0-dev1/agenteval.json CHANGED
@@ -4,87 +4,111 @@
   "version": "1.0.0-dev1",
   "splits": [
     {
-      "name": "validation",
+      "name": "test",
       "tasks": [
         {
           "name": "swe-bench",
-          "path": "openhands/swe-bench",
-          "primary_metric": "resolved/mean",
-          "tags": ["swe-bench"]
+          "tags": [
+            "Overall",
+            "Bug Fixing",
+            "swe-bench"
+          ]
         },
         {
-          "name": "multi-swe-bench",
-          "path": "openhands/multi-swe-bench",
-          "primary_metric": "resolved/mean",
-          "tags": ["multi-swe-bench"]
+          "name": "swe-bench-multimodal",
+          "tags": [
+            "Overall",
+            "Bug Fixing",
+            "swe-bench-multimodal"
+          ]
         },
         {
-          "name": "swe-bench-multimodal",
-          "path": "openhands/swe-bench-multimodal",
-          "primary_metric": "resolved/mean",
-          "tags": ["swe-bench-multimodal"]
+          "name": "commit0",
+          "tags": [
+            "Overall",
+            "App Creation",
+            "commit0"
+          ]
         },
         {
-          "name": "swt-bench",
-          "path": "openhands/swt-bench",
-          "primary_metric": "generated/mean",
-          "tags": ["swt-bench"]
+          "name": "multi-swe-bench",
+          "tags": [
+            "Overall",
+            "Frontend Development",
+            "multi-swe-bench"
+          ]
         },
         {
-          "name": "commit0",
-          "path": "openhands/commit0",
-          "primary_metric": "tests_passed/mean",
-          "tags": ["commit0"]
+          "name": "swt-bench",
+          "tags": [
+            "Overall",
+            "Test Generation",
+            "swt-bench"
+          ]
         },
         {
           "name": "gaia",
-          "path": "openhands/gaia",
-          "primary_metric": "correct/mean",
-          "tags": ["gaia"]
+          "tags": [
+            "Overall",
+            "Information Gathering",
+            "gaia"
+          ]
         }
       ]
     },
     {
-      "name": "test",
+      "name": "validation",
       "tasks": [
         {
           "name": "swe-bench",
-          "path": "openhands/swe-bench",
-          "primary_metric": "resolved/mean",
-          "tags": ["swe-bench"]
+          "tags": [
+            "Overall",
+            "Bug Fixing",
+            "swe-bench"
+          ]
         },
         {
-          "name": "multi-swe-bench",
-          "path": "openhands/multi-swe-bench",
-          "primary_metric": "resolved/mean",
-          "tags": ["multi-swe-bench"]
+          "name": "swe-bench-multimodal",
+          "tags": [
+            "Overall",
+            "Bug Fixing",
+            "swe-bench-multimodal"
+          ]
         },
         {
-          "name": "swe-bench-multimodal",
-          "path": "openhands/swe-bench-multimodal",
-          "primary_metric": "resolved/mean",
-          "tags": ["swe-bench-multimodal"]
+          "name": "commit0",
+          "tags": [
+            "Overall",
+            "App Creation",
+            "commit0"
+          ]
         },
         {
-          "name": "swt-bench",
-          "path": "openhands/swt-bench",
-          "primary_metric": "generated/mean",
-          "tags": ["swt-bench"]
+          "name": "multi-swe-bench",
+          "tags": [
+            "Overall",
+            "Frontend Development",
+            "multi-swe-bench"
+          ]
         },
         {
-          "name": "commit0",
-          "path": "openhands/commit0",
-          "primary_metric": "tests_passed/mean",
-          "tags": ["commit0"]
+          "name": "swt-bench",
+          "tags": [
+            "Overall",
+            "Test Generation",
+            "swt-bench"
+          ]
         },
         {
           "name": "gaia",
-          "path": "openhands/gaia",
-          "primary_metric": "correct/mean",
-          "tags": ["gaia"]
+          "tags": [
+            "Overall",
+            "Information Gathering",
+            "gaia"
+          ]
         }
       ]
     }
   ]
 }
-}
+}
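Under the new schema each task carries three kinds of tags: "Overall", a category label, and the benchmark's own name; the loader treats any tag that is neither "Overall" nor the task name as a category. A minimal standalone sketch of the mapping this yields (mirroring the loader's logic; assumes the file is read from the path in this commit, "test" split):

import json

with open("data/1.0.0-dev1/agenteval.json") as f:
    suite = json.load(f)

tag_map = {}
for split in suite["splits"]:
    if split["name"] != "test":
        continue
    for task in split["tasks"]:
        for tag in task["tags"]:
            # Category = any tag that is neither "Overall" nor the task's own name
            if tag not in ("Overall", task["name"]):
                tag_map.setdefault(tag, [])
                if task["name"] not in tag_map[tag]:
                    tag_map[tag].append(task["name"])

print(tag_map["Bug Fixing"])  # ['swe-bench', 'swe-bench-multimodal']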
simple_data_loader.py CHANGED
@@ -36,15 +36,24 @@ class SimpleLeaderboardViewer:
                 "splits": []
             }
 
-        # Build tag map from config
+        # Build tag map from config - organize benchmarks by category
         self.tag_map = {}
+        self.benchmark_to_categories = {}  # Maps benchmark name to its categories
         for split_config in self.suite_config.get("splits", []):
             if split_config["name"] == split:
                 for task in split_config.get("tasks", []):
+                    task_name = task["name"]
+                    # Store which categories this benchmark belongs to
+                    self.benchmark_to_categories[task_name] = []
                     for tag in task.get("tags", []):
-                        if tag not in self.tag_map:
-                            self.tag_map[tag] = []
-                        self.tag_map[tag].append(task["name"])
+                        # Skip "Overall" and the benchmark's own name
+                        if tag != "Overall" and tag != task_name:
+                            # This is a category tag
+                            if tag not in self.tag_map:
+                                self.tag_map[tag] = []
+                            if task_name not in self.tag_map[tag]:
+                                self.tag_map[tag].append(task_name)
+                            self.benchmark_to_categories[task_name].append(tag)
 
     def _load(self):
         """Load the JSONL file for the split and return DataFrame and tag map."""
@@ -99,16 +108,34 @@
         dataset_scores = []
         dataset_costs = []
 
+        # Track category-level data for aggregation
+        category_data = {}  # {category: {'scores': [...], 'costs': [...]}}
+
         for _, row in agent_records.iterrows():
             tags = row['tags'] if isinstance(row['tags'], list) else [row['tags']]
             for tag in tags:
-                # Add columns for this specific dataset
+                # Add columns for this specific dataset/benchmark
                 record[f'{tag} score'] = row['score']
                 record[f'{tag} cost'] = row['total_cost']
                 dataset_scores.append(row['score'])
                 dataset_costs.append(row['total_cost'])
+
+                # Track category-level data for aggregation
+                if tag in self.benchmark_to_categories:
+                    for category in self.benchmark_to_categories[tag]:
+                        if category not in category_data:
+                            category_data[category] = {'scores': [], 'costs': []}
+                        category_data[category]['scores'].append(row['score'])
+                        category_data[category]['costs'].append(row['total_cost'])
+
+        # Calculate category-level aggregates
+        for category, data in category_data.items():
+            if data['scores']:
+                record[f'{category} score'] = sum(data['scores']) / len(data['scores'])
+            if data['costs']:
+                record[f'{category} cost'] = sum(data['costs']) / len(data['costs'])
 
-        # Calculate overall score and cost (average across datasets)
+        # Calculate overall score and cost (average across all benchmarks)
         if dataset_scores:
             record['overall score'] = sum(dataset_scores) / len(dataset_scores)
             record['overall cost'] = sum(dataset_costs) / len(dataset_costs)
  record['overall cost'] = sum(dataset_costs) / len(dataset_costs)