| """ | |
| Simple data loader for OpenHands Index leaderboard. | |
| Loads JSONL files from local directory or GitHub repository. | |
| """ | |
| import os | |
| import pandas as pd | |
| import json | |
| from pathlib import Path | |

class SimpleLeaderboardViewer:
    """Simple replacement for agent-eval's LeaderboardViewer."""

    def __init__(self, data_dir: str, config: str, split: str):
        """
        Args:
            data_dir: Path to the data directory.
            config: Config name (e.g., "1.0.0-dev1").
            split: Split name (e.g., "validation" or "test").
        """
        self.data_dir = Path(data_dir)
        self.config = config
        self.split = split
        self.config_path = self.data_dir / config

        # Load the suite configuration if present; otherwise fall back to an
        # empty suite so the viewer can still render an empty leaderboard.
        config_file = self.config_path / "agenteval.json"
        if config_file.exists():
            with open(config_file) as f:
                suite_config = json.load(f)
            self.suite_config = suite_config["suite_config"]
        else:
            self.suite_config = {
                "name": "openhands-index",
                "version": config,
                "splits": [],
            }
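        # Expected agenteval.json layout, sketched from the keys read above
        # (benchmark and category names are illustrative, not from the source):
        # {
        #   "suite_config": {
        #     "name": "openhands-index",
        #     "version": "1.0.0-dev1",
        #     "splits": [
        #       {"name": "validation",
        #        "tasks": [{"name": "swe-bench",
        #                   "tags": ["swe-bench", "Coding", "Overall"]}]}
        #     ]
        #   }
        # }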
        # Build the tag map from the config: organize benchmarks by category.
        self.tag_map = {}
        self.benchmark_to_categories = {}  # Maps benchmark name to its categories
        for split_config in self.suite_config.get("splits", []):
            if split_config["name"] == split:
                for task in split_config.get("tasks", []):
                    task_name = task["name"]
                    # Record which categories this benchmark belongs to.
                    self.benchmark_to_categories[task_name] = []
                    for tag in task.get("tags", []):
                        # Skip "Overall" and the benchmark's own name;
                        # everything else is a category tag.
                        if tag != "Overall" and tag != task_name:
                            if tag not in self.tag_map:
                                self.tag_map[tag] = []
                            if task_name not in self.tag_map[tag]:
                                self.tag_map[tag].append(task_name)
                            self.benchmark_to_categories[task_name].append(tag)
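        # With the illustrative config above, the resulting maps would be:
        #   self.tag_map == {"Coding": ["swe-bench"]}
        #   self.benchmark_to_categories == {"swe-bench": ["Coding"]}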

    def _load(self):
        """Load the JSONL file for the split; return a DataFrame and the tag map."""
        jsonl_file = self.config_path / f"{self.split}.jsonl"
        if not jsonl_file.exists():
            # Return an empty dataframe carrying an error message.
            return pd.DataFrame({
                "Message": [f"No data found for split '{self.split}'. Expected file: {jsonl_file}"]
            }), {}
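        # Each JSONL line is expected to be one benchmark result. Sketch of a
        # record, inferred from the keys read below (values illustrative):
        # {"agent_name": "...", "llm_base": "...", "openness": "...",
        #  "tool_usage": "...", "submission_time": "...", "tags": ["..."],
        #  "score": 0.5, "total_cost": 1.23}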
        try:
            # Read the JSONL file, one record per non-empty line.
            records = []
            with open(jsonl_file, 'r') as f:
                for line in f:
                    if line.strip():
                        records.append(json.loads(line))
            if not records:
                return pd.DataFrame({
                    "Message": [f"No data in file: {jsonl_file}"]
                }), {}

            df = pd.DataFrame(records)

            # Transform to the format expected by the leaderboard:
            # group by agent to aggregate results across datasets.
            transformed_records = []
            for agent_name in df['agent_name'].unique():
                agent_records = df[df['agent_name'] == agent_name]
                # Build a single record for this agent.
                first_record = agent_records.iloc[0]
                record = {
                    # Core agent info - uses final display names.
                    'agent': agent_name,  # Will become "Agent Version" after prettifying
                    'models used': first_record['llm_base'],  # Will become "Model"
                    'openness': first_record['openness'],  # Will become "Openness"
                    'agent tooling': first_record['tool_usage'],  # Will become "Agent Tooling"
                    'date': first_record['submission_time'],  # Will become "Date"
                    # Additional columns expected by the transformer.
                    'id': first_record.get('id', agent_name),  # Will become "Id"
                    'source': first_record.get('source', ''),  # Will become "Source"
                    'logs': first_record.get('logs', ''),  # Will become "Logs"
                }

                # Add per-dataset scores and costs.
                dataset_scores = []
                dataset_costs = []
                # Track category-level data for aggregation:
                # {category: {'scores': [...], 'costs': [...]}}
                category_data = {}
                for _, row in agent_records.iterrows():
                    tags = row['tags'] if isinstance(row['tags'], list) else [row['tags']]
                    # Count each benchmark row once in the overall aggregates,
                    # regardless of how many tags it carries.
                    dataset_scores.append(row['score'])
                    dataset_costs.append(row['total_cost'])
                    for tag in tags:
                        # Add columns for this specific dataset/benchmark.
                        record[f'{tag} score'] = row['score']
                        record[f'{tag} cost'] = row['total_cost']
                        # Track category-level data for aggregation.
                        if tag in self.benchmark_to_categories:
                            for category in self.benchmark_to_categories[tag]:
                                if category not in category_data:
                                    category_data[category] = {'scores': [], 'costs': []}
                                category_data[category]['scores'].append(row['score'])
                                category_data[category]['costs'].append(row['total_cost'])

                # Calculate category-level aggregates (mean over member benchmarks).
                for category, data in category_data.items():
                    if data['scores']:
                        record[f'{category} score'] = sum(data['scores']) / len(data['scores'])
                    if data['costs']:
                        record[f'{category} cost'] = sum(data['costs']) / len(data['costs'])

                # Calculate the overall score and cost (mean across all benchmarks).
                if dataset_scores:
                    record['overall score'] = sum(dataset_scores) / len(dataset_scores)
                    record['overall cost'] = sum(dataset_costs) / len(dataset_costs)
                else:
                    record['overall score'] = None
                    record['overall cost'] = None
                transformed_records.append(record)

            transformed_df = pd.DataFrame(transformed_records)

            # Build the tag map from the data if the config did not provide one.
            if not self.tag_map:
                all_tags = set()
                for _, row in df.iterrows():
                    tags = row['tags'] if isinstance(row['tags'], list) else [row['tags']]
                    all_tags.update(tags)
                # Simple mapping: each tag maps to itself.
                self.tag_map = {tag: [tag] for tag in sorted(all_tags)}

            return transformed_df, self.tag_map
        except Exception as e:
            import traceback
            traceback.print_exc()
            return pd.DataFrame({
                "Message": [f"Error loading data: {e}"]
            }), {}

    def get_dataframe(self):
        """Get the raw dataframe."""
        df, _ = self._load()
        return df

def load_mock_data_locally(data_dir: str = "mock_results"):
    """
    Load mock data from a local directory for testing.

    Args:
        data_dir: Path to the mock results directory.

    Returns:
        Dictionary mapping split names to SimpleLeaderboardViewer instances.
    """
    viewers = {}
    data_path = Path(data_dir)
    if not data_path.exists():
        print(f"Warning: Mock data directory '{data_dir}' not found")
        return viewers

    # Each subdirectory is a config; each JSONL file inside it is a split.
    for config_dir in data_path.iterdir():
        if config_dir.is_dir():
            config_name = config_dir.name
            for jsonl_file in config_dir.glob("*.jsonl"):
                split_name = jsonl_file.stem
                viewers[split_name] = SimpleLeaderboardViewer(
                    data_dir=str(data_path),
                    config=config_name,
                    split=split_name,
                )
    return viewers
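

if __name__ == "__main__":
    # Minimal smoke test (illustrative, not part of the original module):
    # assumes a local "mock_results/<config>/<split>.jsonl" layout matching
    # the directory structure load_mock_data_locally expects, then prints
    # the transformed leaderboard dataframe for each discovered split.
    for split_name, viewer in load_mock_data_locally().items():
        print(f"--- {split_name} ---")
        print(viewer.get_dataframe())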