Commit e003f7b by openhands · Parent: 0ee2099

Update data loader to support agent-centric directory structure


- Add support for new results/YYYYMMDD_model/ directory structure
- Each agent directory contains metadata.json and scores.json
- Maintain backward compatibility with old JSONL format
- Update mock data to use new structure (expected layout sketched below)

Co-authored-by: openhands <[email protected]>
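For reference, the loader treats each directory under the versioned results root as one agent submission, identified by its metadata.json and scored by its scores.json. Below is a minimal sketch of the expected layout plus an integrity check; the `check_agent_dirs` helper and the hard-coded root path are illustrative only, not part of this commit:

```python
from pathlib import Path

# Expected layout (illustrative):
#   data/extracted/1.0.0-dev1/results/
#       20251124_claude_3_5_sonnet_20241022/
#           metadata.json   # agent identity: agent_name, model, openness, ...
#           scores.json     # JSON array of per-benchmark score entries
#       20251124_gpt_4o_2024_11_20/
#           ...

def check_agent_dirs(results_root: Path) -> list[str]:
    """Return names of agent directories missing either required file."""
    if not results_root.exists():
        return []
    incomplete = []
    for agent_dir in results_root.iterdir():
        if not agent_dir.is_dir():
            continue
        if not (agent_dir / "metadata.json").exists() or not (agent_dir / "scores.json").exists():
            incomplete.append(agent_dir.name)
    return incomplete

print(check_agent_dirs(Path("data/extracted/1.0.0-dev1/results")))
```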

data/extracted/1.0.0-dev1/results/20251124_claude_3_5_sonnet_20241022/metadata.json ADDED
@@ -0,0 +1,9 @@
+{
+    "agent_name": "OpenHands CodeAct v2.1",
+    "agent_version": "OpenHands CodeAct v2.1",
+    "model": "claude-3-5-sonnet-20241022",
+    "openness": "closed_api_available",
+    "tool_usage": "standard",
+    "submission_time": "2025-11-24T19:56:00.092865",
+    "directory_name": "20251124_claude_3_5_sonnet_20241022"
+}
data/extracted/1.0.0-dev1/results/20251124_claude_3_5_sonnet_20241022/scores.json ADDED
@@ -0,0 +1,62 @@
+[
+    {
+        "benchmark": "swe-bench",
+        "score": 48.3,
+        "metric": "resolve_rate",
+        "total_cost": 34.15,
+        "total_runtime": 541.5,
+        "tags": [
+            "swe-bench"
+        ]
+    },
+    {
+        "benchmark": "swe-bench-multimodal",
+        "score": 42.1,
+        "metric": "resolve_rate",
+        "total_cost": 31.05,
+        "total_runtime": 510.5,
+        "tags": [
+            "swe-bench-multimodal"
+        ]
+    },
+    {
+        "benchmark": "commit0",
+        "score": 71.2,
+        "metric": "test_pass_rate",
+        "total_cost": 45.6,
+        "total_runtime": 656.0,
+        "tags": [
+            "commit0"
+        ]
+    },
+    {
+        "benchmark": "multi-swe-bench",
+        "score": 35.2,
+        "metric": "resolve_rate",
+        "total_cost": 27.6,
+        "total_runtime": 476.0,
+        "tags": [
+            "multi-swe-bench"
+        ]
+    },
+    {
+        "benchmark": "swt-bench",
+        "score": 65.4,
+        "metric": "success_rate",
+        "total_cost": 42.7,
+        "total_runtime": 627.0,
+        "tags": [
+            "swt-bench"
+        ]
+    },
+    {
+        "benchmark": "gaia",
+        "score": 58.7,
+        "metric": "accuracy",
+        "total_cost": 39.35,
+        "total_runtime": 593.5,
+        "tags": [
+            "gaia"
+        ]
+    }
+]
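Each scores.json is a flat JSON array, so it loads directly into a pandas DataFrame. A quick sketch of how one file can be summarized; the path below is just this commit's first example:

```python
import json
from pathlib import Path

import pandas as pd

scores_path = Path(
    "data/extracted/1.0.0-dev1/results/"
    "20251124_claude_3_5_sonnet_20241022/scores.json"
)
df = pd.DataFrame(json.loads(scores_path.read_text()))

# One row per benchmark: score, metric, total_cost, total_runtime, tags.
print(df[["benchmark", "metric", "score", "total_cost"]])
```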
data/extracted/1.0.0-dev1/results/20251124_claude_3_opus_20240229/metadata.json ADDED
@@ -0,0 +1,9 @@
+{
+    "agent_name": "SWE-Agent",
+    "agent_version": "SWE-Agent",
+    "model": "claude-3-opus-20240229",
+    "openness": "closed_api_available",
+    "tool_usage": "custom_interface",
+    "submission_time": "2025-11-24T19:56:00.092922",
+    "directory_name": "20251124_claude_3_opus_20240229"
+}
data/extracted/1.0.0-dev1/results/20251124_claude_3_opus_20240229/scores.json ADDED
@@ -0,0 +1,62 @@
+[
+    {
+        "benchmark": "swe-bench",
+        "score": 29.8,
+        "metric": "resolve_rate",
+        "total_cost": 24.9,
+        "total_runtime": 449.0,
+        "tags": [
+            "swe-bench"
+        ]
+    },
+    {
+        "benchmark": "swe-bench-multimodal",
+        "score": 25.7,
+        "metric": "resolve_rate",
+        "total_cost": 22.85,
+        "total_runtime": 428.5,
+        "tags": [
+            "swe-bench-multimodal"
+        ]
+    },
+    {
+        "benchmark": "commit0",
+        "score": 52.1,
+        "metric": "test_pass_rate",
+        "total_cost": 36.05,
+        "total_runtime": 560.5,
+        "tags": [
+            "commit0"
+        ]
+    },
+    {
+        "benchmark": "multi-swe-bench",
+        "score": 21.5,
+        "metric": "resolve_rate",
+        "total_cost": 20.75,
+        "total_runtime": 407.5,
+        "tags": [
+            "multi-swe-bench"
+        ]
+    },
+    {
+        "benchmark": "swt-bench",
+        "score": 44.2,
+        "metric": "success_rate",
+        "total_cost": 32.1,
+        "total_runtime": 521.0,
+        "tags": [
+            "swt-bench"
+        ]
+    },
+    {
+        "benchmark": "gaia",
+        "score": 39.4,
+        "metric": "accuracy",
+        "total_cost": 29.7,
+        "total_runtime": 497.0,
+        "tags": [
+            "gaia"
+        ]
+    }
+]
data/extracted/1.0.0-dev1/results/20251124_gpt_4_turbo_2024_04_09/metadata.json ADDED
@@ -0,0 +1,9 @@
+{
+    "agent_name": "AutoCodeRover",
+    "agent_version": "AutoCodeRover",
+    "model": "gpt-4-turbo-2024-04-09",
+    "openness": "closed_api_available",
+    "tool_usage": "standard",
+    "submission_time": "2025-11-24T19:56:00.092908",
+    "directory_name": "20251124_gpt_4_turbo_2024_04_09"
+}
data/extracted/1.0.0-dev1/results/20251124_gpt_4_turbo_2024_04_09/scores.json ADDED
@@ -0,0 +1,62 @@
+[
+    {
+        "benchmark": "swe-bench",
+        "score": 38.7,
+        "metric": "resolve_rate",
+        "total_cost": 29.35,
+        "total_runtime": 493.5,
+        "tags": [
+            "swe-bench"
+        ]
+    },
+    {
+        "benchmark": "swe-bench-multimodal",
+        "score": 34.2,
+        "metric": "resolve_rate",
+        "total_cost": 27.1,
+        "total_runtime": 471.0,
+        "tags": [
+            "swe-bench-multimodal"
+        ]
+    },
+    {
+        "benchmark": "commit0",
+        "score": 61.5,
+        "metric": "test_pass_rate",
+        "total_cost": 40.75,
+        "total_runtime": 607.5,
+        "tags": [
+            "commit0"
+        ]
+    },
+    {
+        "benchmark": "multi-swe-bench",
+        "score": 28.4,
+        "metric": "resolve_rate",
+        "total_cost": 24.2,
+        "total_runtime": 442.0,
+        "tags": [
+            "multi-swe-bench"
+        ]
+    },
+    {
+        "benchmark": "swt-bench",
+        "score": 54.1,
+        "metric": "success_rate",
+        "total_cost": 37.05,
+        "total_runtime": 570.5,
+        "tags": [
+            "swt-bench"
+        ]
+    },
+    {
+        "benchmark": "gaia",
+        "score": 48.3,
+        "metric": "accuracy",
+        "total_cost": 34.15,
+        "total_runtime": 541.5,
+        "tags": [
+            "gaia"
+        ]
+    }
+]
data/extracted/1.0.0-dev1/results/20251124_gpt_4o_2024_11_20/metadata.json ADDED
@@ -0,0 +1,9 @@
+{
+    "agent_name": "OpenHands CodeAct v2.0",
+    "agent_version": "OpenHands CodeAct v2.0",
+    "model": "gpt-4o-2024-11-20",
+    "openness": "closed_api_available",
+    "tool_usage": "standard",
+    "submission_time": "2025-11-24T19:56:00.092895",
+    "directory_name": "20251124_gpt_4o_2024_11_20"
+}
data/extracted/1.0.0-dev1/results/20251124_gpt_4o_2024_11_20/scores.json ADDED
@@ -0,0 +1,62 @@
+[
+    {
+        "benchmark": "swe-bench",
+        "score": 45.1,
+        "metric": "resolve_rate",
+        "total_cost": 32.55,
+        "total_runtime": 525.5,
+        "tags": [
+            "swe-bench"
+        ]
+    },
+    {
+        "benchmark": "swe-bench-multimodal",
+        "score": 39.5,
+        "metric": "resolve_rate",
+        "total_cost": 29.75,
+        "total_runtime": 497.5,
+        "tags": [
+            "swe-bench-multimodal"
+        ]
+    },
+    {
+        "benchmark": "commit0",
+        "score": 68.9,
+        "metric": "test_pass_rate",
+        "total_cost": 44.45,
+        "total_runtime": 644.5,
+        "tags": [
+            "commit0"
+        ]
+    },
+    {
+        "benchmark": "multi-swe-bench",
+        "score": 32.8,
+        "metric": "resolve_rate",
+        "total_cost": 26.4,
+        "total_runtime": 464.0,
+        "tags": [
+            "multi-swe-bench"
+        ]
+    },
+    {
+        "benchmark": "swt-bench",
+        "score": 62.3,
+        "metric": "success_rate",
+        "total_cost": 41.15,
+        "total_runtime": 611.5,
+        "tags": [
+            "swt-bench"
+        ]
+    },
+    {
+        "benchmark": "gaia",
+        "score": 55.2,
+        "metric": "accuracy",
+        "total_cost": 37.6,
+        "total_runtime": 576.0,
+        "tags": [
+            "gaia"
+        ]
+    }
+]
data/extracted/1.0.0-dev1/results/20251124_gpt_4o_mini_2024_07_18/metadata.json ADDED
@@ -0,0 +1,9 @@
+{
+    "agent_name": "Agentless",
+    "agent_version": "Agentless",
+    "model": "gpt-4o-mini-2024-07-18",
+    "openness": "closed_api_available",
+    "tool_usage": "standard",
+    "submission_time": "2025-11-24T19:56:00.092916",
+    "directory_name": "20251124_gpt_4o_mini_2024_07_18"
+}
data/extracted/1.0.0-dev1/results/20251124_gpt_4o_mini_2024_07_18/scores.json ADDED
@@ -0,0 +1,62 @@
+[
+    {
+        "benchmark": "swe-bench",
+        "score": 32.5,
+        "metric": "resolve_rate",
+        "total_cost": 26.25,
+        "total_runtime": 462.5,
+        "tags": [
+            "swe-bench"
+        ]
+    },
+    {
+        "benchmark": "swe-bench-multimodal",
+        "score": 28.9,
+        "metric": "resolve_rate",
+        "total_cost": 24.45,
+        "total_runtime": 444.5,
+        "tags": [
+            "swe-bench-multimodal"
+        ]
+    },
+    {
+        "benchmark": "commit0",
+        "score": 55.3,
+        "metric": "test_pass_rate",
+        "total_cost": 37.65,
+        "total_runtime": 576.5,
+        "tags": [
+            "commit0"
+        ]
+    },
+    {
+        "benchmark": "multi-swe-bench",
+        "score": 24.1,
+        "metric": "resolve_rate",
+        "total_cost": 22.05,
+        "total_runtime": 420.5,
+        "tags": [
+            "multi-swe-bench"
+        ]
+    },
+    {
+        "benchmark": "swt-bench",
+        "score": 47.8,
+        "metric": "success_rate",
+        "total_cost": 33.9,
+        "total_runtime": 539.0,
+        "tags": [
+            "swt-bench"
+        ]
+    },
+    {
+        "benchmark": "gaia",
+        "score": 42.1,
+        "metric": "accuracy",
+        "total_cost": 31.05,
+        "total_runtime": 510.5,
+        "tags": [
+            "gaia"
+        ]
+    }
+]
simple_data_loader.py CHANGED
@@ -55,31 +55,93 @@ class SimpleLeaderboardViewer:
             self.tag_map[tag].append(task_name)
             self.benchmark_to_categories[task_name].append(tag)
 
+    def _load_from_agent_dirs(self):
+        """Load data from new agent-centric directory structure (results/YYYYMMDD_model/)."""
+        results_dir = self.config_path / "results"
+
+        if not results_dir.exists():
+            return None  # Fall back to old format
+
+        all_records = []
+
+        # Iterate through each agent directory
+        for agent_dir in results_dir.iterdir():
+            if not agent_dir.is_dir():
+                continue
+
+            metadata_file = agent_dir / "metadata.json"
+            scores_file = agent_dir / "scores.json"
+
+            if not metadata_file.exists() or not scores_file.exists():
+                continue
+
+            # Load metadata and scores
+            with open(metadata_file) as f:
+                metadata = json.load(f)
+
+            with open(scores_file) as f:
+                scores = json.load(f)
+
+            # Create one record per benchmark (mimicking old JSONL format)
+            for score_entry in scores:
+                record = {
+                    'agent_name': metadata.get('agent_name', 'Unknown'),
+                    'llm_base': metadata.get('model', 'unknown'),
+                    'openness': metadata.get('openness', 'unknown'),
+                    'tool_usage': metadata.get('tool_usage', 'standard'),
+                    'submission_time': metadata.get('submission_time', ''),
+                    'score': score_entry.get('score'),
+                    'metric': score_entry.get('metric', 'unknown'),
+                    'total_cost': score_entry.get('total_cost'),
+                    'total_runtime': score_entry.get('total_runtime'),
+                    'tags': [score_entry.get('benchmark')],
+                }
+                all_records.append(record)
+
+        if not all_records:
+            return None  # Fall back to old format
+
+        return pd.DataFrame(all_records)
+
     def _load(self):
         """Load the JSONL file for the split and return DataFrame and tag map."""
-        jsonl_file = self.config_path / f"{self.split}.jsonl"
-
-        if not jsonl_file.exists():
-            # Return empty dataframe with error message
-            return pd.DataFrame({
-                "Message": [f"No data found for split '{self.split}'. Expected file: {jsonl_file}"]
-            }), {}
+        # Try new format first (agent-centric directories)
+        df = self._load_from_agent_dirs()
 
-        try:
-            # Read JSONL file
-            records = []
-            with open(jsonl_file, 'r') as f:
-                for line in f:
-                    if line.strip():
-                        records.append(json.loads(line))
+        if df is None:
+            # Fall back to old format (benchmark-centric JSONL)
+            jsonl_file = self.config_path / f"{self.split}.jsonl"
 
-            if not records:
+            if not jsonl_file.exists():
+                # Return empty dataframe with error message
                 return pd.DataFrame({
-                    "Message": [f"No data in file: {jsonl_file}"]
+                    "Message": [f"No data found for split '{self.split}'. Expected file: {jsonl_file}"]
                 }), {}
 
-            # Convert to DataFrame
-            df = pd.DataFrame(records)
+            try:
+                # Read JSONL file
+                records = []
+                with open(jsonl_file, 'r') as f:
+                    for line in f:
+                        if line.strip():
+                            records.append(json.loads(line))
+
+                if not records:
+                    return pd.DataFrame({
+                        "Message": [f"No data in file: {jsonl_file}"]
+                    }), {}
+
+                # Convert to DataFrame
+                df = pd.DataFrame(records)
+            except Exception as e:
+                import traceback
+                traceback.print_exc()
+                return pd.DataFrame({
+                    "Message": [f"Error loading data: {e}"]
+                }), {}
+
+        # Now process the dataframe (works for both old and new format)
+        try:
 
             # Transform to expected format for leaderboard
             # Group by agent to aggregate results across datasets
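To sanity-check the new code path end to end, one can build a throwaway results tree and run the same discovery logic standalone. This is a sketch that inlines the flattening loop from `_load_from_agent_dirs` rather than importing the class; the agent name, model, and file contents are trimmed-down stand-ins for the mock data above:

```python
import json
import tempfile
from pathlib import Path

import pandas as pd

with tempfile.TemporaryDirectory() as tmp:
    # Build a one-agent results tree in the new layout.
    agent_dir = Path(tmp) / "results" / "20251124_example_model"
    agent_dir.mkdir(parents=True)
    (agent_dir / "metadata.json").write_text(json.dumps({
        "agent_name": "Example Agent",
        "model": "example-model",
        "openness": "closed_api_available",
        "tool_usage": "standard",
        "submission_time": "2025-11-24T00:00:00",
    }))
    (agent_dir / "scores.json").write_text(json.dumps([
        {"benchmark": "swe-bench", "score": 48.3, "metric": "resolve_rate",
         "total_cost": 34.15, "total_runtime": 541.5, "tags": ["swe-bench"]},
    ]))

    # Same flattening as _load_from_agent_dirs: one record per benchmark.
    records = []
    for d in (Path(tmp) / "results").iterdir():
        metadata = json.loads((d / "metadata.json").read_text())
        for entry in json.loads((d / "scores.json").read_text()):
            records.append({
                "agent_name": metadata.get("agent_name", "Unknown"),
                "llm_base": metadata.get("model", "unknown"),
                "score": entry.get("score"),
                "metric": entry.get("metric", "unknown"),
                "tags": [entry.get("benchmark")],
            })
    print(pd.DataFrame(records))
```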