Commit e003f7b by openhands · Parent: 0ee2099

Update data loader to support agent-centric directory structure


- Add support for new results/YYYYMMDD_model/ directory structure
- Each agent directory contains metadata.json and scores.json
- Maintain backward compatibility with old JSONL format
- Update mock data to use new structure (expected layout sketched below)

Co-authored-by: openhands <[email protected]>
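For reference, the loader treats each directory under the versioned results root as one agent submission, identified by its metadata.json and scored by its scores.json. Below is a minimal sketch of the expected layout plus an integrity check; the `check_agent_dirs` helper and the hard-coded root path are illustrative only, not part of this commit:

```python
from pathlib import Path

# Expected layout (illustrative):
#   data/extracted/1.0.0-dev1/results/
#       20251124_claude_3_5_sonnet_20241022/
#           metadata.json   # agent identity: agent_name, model, openness, ...
#           scores.json     # JSON array of per-benchmark score entries
#       20251124_gpt_4o_2024_11_20/
#           ...

def check_agent_dirs(results_root: Path) -> list[str]:
    """Return names of agent directories missing either required file."""
    if not results_root.exists():
        return []
    incomplete = []
    for agent_dir in results_root.iterdir():
        if not agent_dir.is_dir():
            continue
        if not (agent_dir / "metadata.json").exists() or not (agent_dir / "scores.json").exists():
            incomplete.append(agent_dir.name)
    return incomplete

print(check_agent_dirs(Path("data/extracted/1.0.0-dev1/results")))
```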

data/extracted/1.0.0-dev1/results/20251124_claude_3_5_sonnet_20241022/metadata.json ADDED
@@ -0,0 +1,9 @@
+{
+    "agent_name": "OpenHands CodeAct v2.1",
+    "agent_version": "OpenHands CodeAct v2.1",
+    "model": "claude-3-5-sonnet-20241022",
+    "openness": "closed_api_available",
+    "tool_usage": "standard",
+    "submission_time": "2025-11-24T19:56:00.092865",
+    "directory_name": "20251124_claude_3_5_sonnet_20241022"
+}
data/extracted/1.0.0-dev1/results/20251124_claude_3_5_sonnet_20241022/scores.json ADDED
@@ -0,0 +1,62 @@
+[
+    {
+        "benchmark": "swe-bench",
+        "score": 48.3,
+        "metric": "resolve_rate",
+        "total_cost": 34.15,
+        "total_runtime": 541.5,
+        "tags": [
+            "swe-bench"
+        ]
+    },
+    {
+        "benchmark": "swe-bench-multimodal",
+        "score": 42.1,
+        "metric": "resolve_rate",
+        "total_cost": 31.05,
+        "total_runtime": 510.5,
+        "tags": [
+            "swe-bench-multimodal"
+        ]
+    },
+    {
+        "benchmark": "commit0",
+        "score": 71.2,
+        "metric": "test_pass_rate",
+        "total_cost": 45.6,
+        "total_runtime": 656.0,
+        "tags": [
+            "commit0"
+        ]
+    },
+    {
+        "benchmark": "multi-swe-bench",
+        "score": 35.2,
+        "metric": "resolve_rate",
+        "total_cost": 27.6,
+        "total_runtime": 476.0,
+        "tags": [
+            "multi-swe-bench"
+        ]
+    },
+    {
+        "benchmark": "swt-bench",
+        "score": 65.4,
+        "metric": "success_rate",
+        "total_cost": 42.7,
+        "total_runtime": 627.0,
+        "tags": [
+            "swt-bench"
+        ]
+    },
+    {
+        "benchmark": "gaia",
+        "score": 58.7,
+        "metric": "accuracy",
+        "total_cost": 39.35,
+        "total_runtime": 593.5,
+        "tags": [
+            "gaia"
+        ]
+    }
+]
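Each scores.json is a flat JSON array, so it loads directly into a pandas DataFrame. A quick sketch of how one file can be summarized; the path below is just this commit's first example:

```python
import json
from pathlib import Path

import pandas as pd

scores_path = Path(
    "data/extracted/1.0.0-dev1/results/"
    "20251124_claude_3_5_sonnet_20241022/scores.json"
)
df = pd.DataFrame(json.loads(scores_path.read_text()))

# One row per benchmark: score, metric, total_cost, total_runtime, tags.
print(df[["benchmark", "metric", "score", "total_cost"]])
```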
data/extracted/1.0.0-dev1/results/20251124_claude_3_opus_20240229/metadata.json ADDED
@@ -0,0 +1,9 @@
+{
+    "agent_name": "SWE-Agent",
+    "agent_version": "SWE-Agent",
+    "model": "claude-3-opus-20240229",
+    "openness": "closed_api_available",
+    "tool_usage": "custom_interface",
+    "submission_time": "2025-11-24T19:56:00.092922",
+    "directory_name": "20251124_claude_3_opus_20240229"
+}
data/extracted/1.0.0-dev1/results/20251124_claude_3_opus_20240229/scores.json ADDED
@@ -0,0 +1,62 @@
+[
+    {
+        "benchmark": "swe-bench",
+        "score": 29.8,
+        "metric": "resolve_rate",
+        "total_cost": 24.9,
+        "total_runtime": 449.0,
+        "tags": [
+            "swe-bench"
+        ]
+    },
+    {
+        "benchmark": "swe-bench-multimodal",
+        "score": 25.7,
+        "metric": "resolve_rate",
+        "total_cost": 22.85,
+        "total_runtime": 428.5,
+        "tags": [
+            "swe-bench-multimodal"
+        ]
+    },
+    {
+        "benchmark": "commit0",
+        "score": 52.1,
+        "metric": "test_pass_rate",
+        "total_cost": 36.05,
+        "total_runtime": 560.5,
+        "tags": [
+            "commit0"
+        ]
+    },
+    {
+        "benchmark": "multi-swe-bench",
+        "score": 21.5,
+        "metric": "resolve_rate",
+        "total_cost": 20.75,
+        "total_runtime": 407.5,
+        "tags": [
+            "multi-swe-bench"
+        ]
+    },
+    {
+        "benchmark": "swt-bench",
+        "score": 44.2,
+        "metric": "success_rate",
+        "total_cost": 32.1,
+        "total_runtime": 521.0,
+        "tags": [
+            "swt-bench"
+        ]
+    },
+    {
+        "benchmark": "gaia",
+        "score": 39.4,
+        "metric": "accuracy",
+        "total_cost": 29.7,
+        "total_runtime": 497.0,
+        "tags": [
+            "gaia"
+        ]
+    }
+]
data/extracted/1.0.0-dev1/results/20251124_gpt_4_turbo_2024_04_09/metadata.json ADDED
@@ -0,0 +1,9 @@
+{
+    "agent_name": "AutoCodeRover",
+    "agent_version": "AutoCodeRover",
+    "model": "gpt-4-turbo-2024-04-09",
+    "openness": "closed_api_available",
+    "tool_usage": "standard",
+    "submission_time": "2025-11-24T19:56:00.092908",
+    "directory_name": "20251124_gpt_4_turbo_2024_04_09"
+}
data/extracted/1.0.0-dev1/results/20251124_gpt_4_turbo_2024_04_09/scores.json ADDED
@@ -0,0 +1,62 @@
+[
+    {
+        "benchmark": "swe-bench",
+        "score": 38.7,
+        "metric": "resolve_rate",
+        "total_cost": 29.35,
+        "total_runtime": 493.5,
+        "tags": [
+            "swe-bench"
+        ]
+    },
+    {
+        "benchmark": "swe-bench-multimodal",
+        "score": 34.2,
+        "metric": "resolve_rate",
+        "total_cost": 27.1,
+        "total_runtime": 471.0,
+        "tags": [
+            "swe-bench-multimodal"
+        ]
+    },
+    {
+        "benchmark": "commit0",
+        "score": 61.5,
+        "metric": "test_pass_rate",
+        "total_cost": 40.75,
+        "total_runtime": 607.5,
+        "tags": [
+            "commit0"
+        ]
+    },
+    {
+        "benchmark": "multi-swe-bench",
+        "score": 28.4,
+        "metric": "resolve_rate",
+        "total_cost": 24.2,
+        "total_runtime": 442.0,
+        "tags": [
+            "multi-swe-bench"
+        ]
+    },
+    {
+        "benchmark": "swt-bench",
+        "score": 54.1,
+        "metric": "success_rate",
+        "total_cost": 37.05,
+        "total_runtime": 570.5,
+        "tags": [
+            "swt-bench"
+        ]
+    },
+    {
+        "benchmark": "gaia",
+        "score": 48.3,
+        "metric": "accuracy",
+        "total_cost": 34.15,
+        "total_runtime": 541.5,
+        "tags": [
+            "gaia"
+        ]
+    }
+]
data/extracted/1.0.0-dev1/results/20251124_gpt_4o_2024_11_20/metadata.json ADDED
@@ -0,0 +1,9 @@
+{
+    "agent_name": "OpenHands CodeAct v2.0",
+    "agent_version": "OpenHands CodeAct v2.0",
+    "model": "gpt-4o-2024-11-20",
+    "openness": "closed_api_available",
+    "tool_usage": "standard",
+    "submission_time": "2025-11-24T19:56:00.092895",
+    "directory_name": "20251124_gpt_4o_2024_11_20"
+}
data/extracted/1.0.0-dev1/results/20251124_gpt_4o_2024_11_20/scores.json ADDED
@@ -0,0 +1,62 @@
+[
+    {
+        "benchmark": "swe-bench",
+        "score": 45.1,
+        "metric": "resolve_rate",
+        "total_cost": 32.55,
+        "total_runtime": 525.5,
+        "tags": [
+            "swe-bench"
+        ]
+    },
+    {
+        "benchmark": "swe-bench-multimodal",
+        "score": 39.5,
+        "metric": "resolve_rate",
+        "total_cost": 29.75,
+        "total_runtime": 497.5,
+        "tags": [
+            "swe-bench-multimodal"
+        ]
+    },
+    {
+        "benchmark": "commit0",
+        "score": 68.9,
+        "metric": "test_pass_rate",
+        "total_cost": 44.45,
+        "total_runtime": 644.5,
+        "tags": [
+            "commit0"
+        ]
+    },
+    {
+        "benchmark": "multi-swe-bench",
+        "score": 32.8,
+        "metric": "resolve_rate",
+        "total_cost": 26.4,
+        "total_runtime": 464.0,
+        "tags": [
+            "multi-swe-bench"
+        ]
+    },
+    {
+        "benchmark": "swt-bench",
+        "score": 62.3,
+        "metric": "success_rate",
+        "total_cost": 41.15,
+        "total_runtime": 611.5,
+        "tags": [
+            "swt-bench"
+        ]
+    },
+    {
+        "benchmark": "gaia",
+        "score": 55.2,
+        "metric": "accuracy",
+        "total_cost": 37.6,
+        "total_runtime": 576.0,
+        "tags": [
+            "gaia"
+        ]
+    }
+]
data/extracted/1.0.0-dev1/results/20251124_gpt_4o_mini_2024_07_18/metadata.json ADDED
@@ -0,0 +1,9 @@
+{
+    "agent_name": "Agentless",
+    "agent_version": "Agentless",
+    "model": "gpt-4o-mini-2024-07-18",
+    "openness": "closed_api_available",
+    "tool_usage": "standard",
+    "submission_time": "2025-11-24T19:56:00.092916",
+    "directory_name": "20251124_gpt_4o_mini_2024_07_18"
+}
data/extracted/1.0.0-dev1/results/20251124_gpt_4o_mini_2024_07_18/scores.json ADDED
@@ -0,0 +1,62 @@
+[
+    {
+        "benchmark": "swe-bench",
+        "score": 32.5,
+        "metric": "resolve_rate",
+        "total_cost": 26.25,
+        "total_runtime": 462.5,
+        "tags": [
+            "swe-bench"
+        ]
+    },
+    {
+        "benchmark": "swe-bench-multimodal",
+        "score": 28.9,
+        "metric": "resolve_rate",
+        "total_cost": 24.45,
+        "total_runtime": 444.5,
+        "tags": [
+            "swe-bench-multimodal"
+        ]
+    },
+    {
+        "benchmark": "commit0",
+        "score": 55.3,
+        "metric": "test_pass_rate",
+        "total_cost": 37.65,
+        "total_runtime": 576.5,
+        "tags": [
+            "commit0"
+        ]
+    },
+    {
+        "benchmark": "multi-swe-bench",
+        "score": 24.1,
+        "metric": "resolve_rate",
+        "total_cost": 22.05,
+        "total_runtime": 420.5,
+        "tags": [
+            "multi-swe-bench"
+        ]
+    },
+    {
+        "benchmark": "swt-bench",
+        "score": 47.8,
+        "metric": "success_rate",
+        "total_cost": 33.9,
+        "total_runtime": 539.0,
+        "tags": [
+            "swt-bench"
+        ]
+    },
+    {
+        "benchmark": "gaia",
+        "score": 42.1,
+        "metric": "accuracy",
+        "total_cost": 31.05,
+        "total_runtime": 510.5,
+        "tags": [
+            "gaia"
+        ]
+    }
+]
simple_data_loader.py CHANGED
@@ -55,31 +55,93 @@ class SimpleLeaderboardViewer:
             self.tag_map[tag].append(task_name)
             self.benchmark_to_categories[task_name].append(tag)
 
+    def _load_from_agent_dirs(self):
+        """Load data from new agent-centric directory structure (results/YYYYMMDD_model/)."""
+        results_dir = self.config_path / "results"
+
+        if not results_dir.exists():
+            return None  # Fall back to old format
+
+        all_records = []
+
+        # Iterate through each agent directory
+        for agent_dir in results_dir.iterdir():
+            if not agent_dir.is_dir():
+                continue
+
+            metadata_file = agent_dir / "metadata.json"
+            scores_file = agent_dir / "scores.json"
+
+            if not metadata_file.exists() or not scores_file.exists():
+                continue
+
+            # Load metadata and scores
+            with open(metadata_file) as f:
+                metadata = json.load(f)
+
+            with open(scores_file) as f:
+                scores = json.load(f)
+
+            # Create one record per benchmark (mimicking old JSONL format)
+            for score_entry in scores:
+                record = {
+                    'agent_name': metadata.get('agent_name', 'Unknown'),
+                    'llm_base': metadata.get('model', 'unknown'),
+                    'openness': metadata.get('openness', 'unknown'),
+                    'tool_usage': metadata.get('tool_usage', 'standard'),
+                    'submission_time': metadata.get('submission_time', ''),
+                    'score': score_entry.get('score'),
+                    'metric': score_entry.get('metric', 'unknown'),
+                    'total_cost': score_entry.get('total_cost'),
+                    'total_runtime': score_entry.get('total_runtime'),
+                    'tags': [score_entry.get('benchmark')],
+                }
+                all_records.append(record)
+
+        if not all_records:
+            return None  # Fall back to old format
+
+        return pd.DataFrame(all_records)
+
     def _load(self):
         """Load the JSONL file for the split and return DataFrame and tag map."""
-        jsonl_file = self.config_path / f"{self.split}.jsonl"
-
-        if not jsonl_file.exists():
-            # Return empty dataframe with error message
-            return pd.DataFrame({
-                "Message": [f"No data found for split '{self.split}'. Expected file: {jsonl_file}"]
-            }), {}
+        # Try new format first (agent-centric directories)
+        df = self._load_from_agent_dirs()
 
-        try:
-            # Read JSONL file
-            records = []
-            with open(jsonl_file, 'r') as f:
-                for line in f:
-                    if line.strip():
-                        records.append(json.loads(line))
+        if df is None:
+            # Fall back to old format (benchmark-centric JSONL)
+            jsonl_file = self.config_path / f"{self.split}.jsonl"
 
-            if not records:
+            if not jsonl_file.exists():
+                # Return empty dataframe with error message
                 return pd.DataFrame({
-                    "Message": [f"No data in file: {jsonl_file}"]
+                    "Message": [f"No data found for split '{self.split}'. Expected file: {jsonl_file}"]
                 }), {}
 
-            # Convert to DataFrame
-            df = pd.DataFrame(records)
+            try:
+                # Read JSONL file
+                records = []
+                with open(jsonl_file, 'r') as f:
+                    for line in f:
+                        if line.strip():
+                            records.append(json.loads(line))
+
+                if not records:
+                    return pd.DataFrame({
+                        "Message": [f"No data in file: {jsonl_file}"]
+                    }), {}
+
+                # Convert to DataFrame
+                df = pd.DataFrame(records)
+            except Exception as e:
+                import traceback
+                traceback.print_exc()
+                return pd.DataFrame({
+                    "Message": [f"Error loading data: {e}"]
+                }), {}
+
+        # Now process the dataframe (works for both old and new format)
+        try:
 
             # Transform to expected format for leaderboard
             # Group by agent to aggregate results across datasets
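To sanity-check the new code path end to end, one can build a throwaway results tree and run the same discovery logic standalone. This is a sketch that inlines the flattening loop from `_load_from_agent_dirs` rather than importing the class; the agent name, model, and file contents are trimmed-down stand-ins for the mock data above:

```python
import json
import tempfile
from pathlib import Path

import pandas as pd

with tempfile.TemporaryDirectory() as tmp:
    # Build a one-agent results tree in the new layout.
    agent_dir = Path(tmp) / "results" / "20251124_example_model"
    agent_dir.mkdir(parents=True)
    (agent_dir / "metadata.json").write_text(json.dumps({
        "agent_name": "Example Agent",
        "model": "example-model",
        "openness": "closed_api_available",
        "tool_usage": "standard",
        "submission_time": "2025-11-24T00:00:00",
    }))
    (agent_dir / "scores.json").write_text(json.dumps([
        {"benchmark": "swe-bench", "score": 48.3, "metric": "resolve_rate",
         "total_cost": 34.15, "total_runtime": 541.5, "tags": ["swe-bench"]},
    ]))

    # Same flattening as _load_from_agent_dirs: one record per benchmark.
    records = []
    for d in (Path(tmp) / "results").iterdir():
        metadata = json.loads((d / "metadata.json").read_text())
        for entry in json.loads((d / "scores.json").read_text()):
            records.append({
                "agent_name": metadata.get("agent_name", "Unknown"),
                "llm_base": metadata.get("model", "unknown"),
                "score": entry.get("score"),
                "metric": entry.get("metric", "unknown"),
                "tags": [entry.get("benchmark")],
            })
    print(pd.DataFrame(records))
```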