"""
Simple data loader for OpenHands Index leaderboard.
Loads JSONL files from local directory or GitHub repository.
"""
import os
import pandas as pd
import json
from pathlib import Path
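
# The loaders below expect roughly the following on-disk layout (illustrative,
# inferred from the code in this module; actual directory and file names will differ):
#
#   <data_dir>/
#     <config>/                      e.g. "1.0.0-dev1"
#       agenteval.json               suite configuration with splits/tasks/tags
#       results/
#         <YYYYMMDD_model>/          one directory per agent submission
#           metadata.json            agent_version, model, openness, submission_time
#           scores.json              list of per-benchmark score entries
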
class SimpleLeaderboardViewer:
    """Simple replacement for agent-eval's LeaderboardViewer."""

    def __init__(self, data_dir: str, config: str, split: str):
        """
        Args:
            data_dir: Path to data directory
            config: Config name (e.g., "1.0.0-dev1")
            split: Split name (e.g., "validation" or "test")
        """
        self.data_dir = Path(data_dir)
        self.config = config
        self.split = split
        self.config_path = self.data_dir / config

        # Load suite configuration
        config_file = self.config_path / "agenteval.json"
        if config_file.exists():
            with open(config_file) as f:
                suite_config = json.load(f)
            self.suite_config = suite_config["suite_config"]
        else:
            self.suite_config = {
                "name": "openhands-index",
                "version": config,
                "splits": []
            }

        # Build tag map from config - organize benchmarks by category
        self.tag_map = {}
        self.benchmark_to_categories = {}  # Maps benchmark name to its categories
        for split_config in self.suite_config.get("splits", []):
            if split_config["name"] == split:
                for task in split_config.get("tasks", []):
                    task_name = task["name"]
                    # Store which categories this benchmark belongs to
                    self.benchmark_to_categories[task_name] = []
                    for tag in task.get("tags", []):
                        # Skip "Overall" and the benchmark's own name
                        if tag != "Overall" and tag != task_name:
                            # This is a category tag
                            if tag not in self.tag_map:
                                self.tag_map[tag] = []
                            if task_name not in self.tag_map[tag]:
                                self.tag_map[tag].append(task_name)
                            self.benchmark_to_categories[task_name].append(tag)
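
    # Illustrative shapes of the structures built above (category and benchmark
    # names are placeholders, not actual suite contents):
    #   self.tag_map                 -> {"<category>": ["<benchmark-a>", "<benchmark-b>"]}
    #   self.benchmark_to_categories -> {"<benchmark-a>": ["<category>"]}
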
    def _load_from_agent_dirs(self):
        """Load data from new agent-centric directory structure (results/YYYYMMDD_model/)."""
        results_dir = self.config_path / "results"
        if not results_dir.exists():
            return None  # No agent-centric results; caller handles the fallback

        all_records = []
        # Iterate through each agent directory
        for agent_dir in results_dir.iterdir():
            if not agent_dir.is_dir():
                continue
            metadata_file = agent_dir / "metadata.json"
            scores_file = agent_dir / "scores.json"
            if not metadata_file.exists() or not scores_file.exists():
                continue

            # Load metadata and scores
            with open(metadata_file) as f:
                metadata = json.load(f)
            with open(scores_file) as f:
                scores = json.load(f)

            # Create one record per benchmark (mimicking old JSONL format)
            for score_entry in scores:
                record = {
                    'agent_version': metadata.get('agent_version', 'Unknown'),
                    'llm_base': metadata.get('model', 'unknown'),
                    'openness': metadata.get('openness', 'unknown'),
                    'submission_time': metadata.get('submission_time', ''),
                    'score': score_entry.get('score'),
                    'metric': score_entry.get('metric', 'unknown'),
                    'total_cost': score_entry.get('total_cost'),
                    'total_runtime': score_entry.get('total_runtime'),
                    'tags': [score_entry.get('benchmark')],
                }
                all_records.append(record)

        if not all_records:
            return None  # No valid agent directories found; caller handles the fallback

        return pd.DataFrame(all_records)
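
    # Sketch of the per-agent files read above (field names come from the .get()
    # calls in _load_from_agent_dirs; the values are made-up examples):
    #   metadata.json: {"agent_version": "...", "model": "...",
    #                   "openness": "...", "submission_time": "..."}
    #   scores.json:   [{"benchmark": "...", "score": 0.5, "metric": "accuracy",
    #                    "total_cost": 1.23, "total_runtime": 456.0}, ...]
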
    def _load(self):
        """Load data from agent-centric directories and return DataFrame and tag map."""
        df = self._load_from_agent_dirs()
        if df is None:
            # Return a placeholder dataframe carrying an error message
            return pd.DataFrame({
                "Message": [f"No data found for split '{self.split}' in results directory"]
            }), {}

        # Process the dataframe
        try:
            # Transform to expected format for leaderboard
            # Group by agent to aggregate results across datasets
            from aliases import OPENNESS_MAPPING

            transformed_records = []
            for agent_version in df['agent_version'].unique():
                agent_records = df[df['agent_version'] == agent_version]

                # Build a single record for this agent
                first_record = agent_records.iloc[0]

                # Normalize openness to "open" or "closed"
                raw_openness = first_record['openness']
                normalized_openness = OPENNESS_MAPPING.get(raw_openness, raw_openness)

                record = {
                    # Core agent info - use final display names
                    'Openhands version': agent_version,  # Will become "OpenHands Version"
                    'Language model': first_record['llm_base'],  # Will become "Language Model"
                    'openness': normalized_openness,  # Will become "Openness" (simplified to "open" or "closed")
                    'date': first_record['submission_time'],  # Will become "Date"
                    # Additional columns expected by the transformer
                    'id': first_record.get('id', agent_version),  # Will become "Id"
                    'source': first_record.get('source', ''),  # Will become "Source"
                    'logs': first_record.get('logs', ''),  # Will become "Logs"
                }

                # Add per-dataset scores and costs
                dataset_scores = []
                dataset_costs = []
                # Track category-level data for aggregation
                category_data = {}  # {category: {'scores': [...], 'costs': [...]}}

                for _, row in agent_records.iterrows():
                    tags = row['tags'] if isinstance(row['tags'], list) else [row['tags']]
                    for tag in tags:
                        # Add columns for this specific dataset/benchmark
                        record[f'{tag} score'] = row['score']
                        record[f'{tag} cost'] = row['total_cost']
                        dataset_scores.append(row['score'])
                        dataset_costs.append(row['total_cost'])

                        # Track category-level data for aggregation
                        if tag in self.benchmark_to_categories:
                            for category in self.benchmark_to_categories[tag]:
                                if category not in category_data:
                                    category_data[category] = {'scores': [], 'costs': []}
                                category_data[category]['scores'].append(row['score'])
                                category_data[category]['costs'].append(row['total_cost'])

                # Calculate category-level aggregates
                category_avg_scores = []
                category_avg_costs = []
                for category, data in category_data.items():
                    if data['scores']:
                        avg_score = sum(data['scores']) / len(data['scores'])
                        record[f'{category} score'] = avg_score
                        category_avg_scores.append(avg_score)
                    if data['costs']:
                        avg_cost = sum(data['costs']) / len(data['costs'])
                        record[f'{category} cost'] = avg_cost
                        category_avg_costs.append(avg_cost)

                # Calculate overall score and cost as macro-average of category averages
                # This ensures each category contributes equally regardless of benchmark count
                if category_avg_scores:
                    record['overall score'] = sum(category_avg_scores) / len(category_avg_scores)
                else:
                    record['overall score'] = None
                if category_avg_costs:
                    record['overall cost'] = sum(category_avg_costs) / len(category_avg_costs)
                else:
                    record['overall cost'] = None

                transformed_records.append(record)

            transformed_df = pd.DataFrame(transformed_records)

            # Build tag map if not already built
            if not self.tag_map:
                # Create simple tag map from the data
                all_tags = set()
                for _, row in df.iterrows():
                    tags = row['tags'] if isinstance(row['tags'], list) else [row['tags']]
                    all_tags.update(tags)
                # Simple mapping: each tag maps to itself
                self.tag_map = {tag: [tag] for tag in sorted(all_tags)}

            return transformed_df, self.tag_map
        except Exception as e:
            import traceback
            traceback.print_exc()
            return pd.DataFrame({
                "Message": [f"Error loading data: {e}"]
            }), {}
    def get_dataframe(self):
        """Get the raw dataframe."""
        df, _ = self._load()
        return df
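
# _load() returns a wide dataframe with one row per agent. The column names below
# are taken from the record dicts built in _load(); "<benchmark>" and "<category>"
# stand in for the tags defined in agenteval.json:
#   'Openhands version', 'Language model', 'openness', 'date', 'id', 'source', 'logs',
#   '<benchmark> score', '<benchmark> cost', '<category> score', '<category> cost',
#   'overall score', 'overall cost'
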
def load_mock_data_locally(data_dir: str = "mock_results"):
    """
    Load mock data from a local directory for testing.

    Args:
        data_dir: Path to mock results directory

    Returns:
        Dictionary mapping split names to SimpleLeaderboardViewer instances
    """
    viewers = {}
    data_path = Path(data_dir)
    if not data_path.exists():
        print(f"Warning: Mock data directory '{data_dir}' not found")
        return viewers

    # Find all config directories
    for config_dir in data_path.iterdir():
        if config_dir.is_dir():
            config_name = config_dir.name
            # Find all JSONL files (each represents a split)
            for jsonl_file in config_dir.glob("*.jsonl"):
                split_name = jsonl_file.stem
                viewer = SimpleLeaderboardViewer(
                    data_dir=str(data_path),
                    config=config_name,
                    split=split_name
                )
                viewers[split_name] = viewer
    return viewers
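
# Minimal usage sketch for local debugging. It assumes a "mock_results" directory
# shaped like the layout described at the top of this file; nothing here is required
# by the leaderboard app itself.
if __name__ == "__main__":
    mock_viewers = load_mock_data_locally("mock_results")
    for split, viewer in mock_viewers.items():
        frame = viewer.get_dataframe()
        print(f"{split}: {len(frame)} rows, columns={list(frame.columns)}")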