| """Generate mock results data in JSONL format for OpenHands Index.""" | |
| import json | |
| import os | |
| from pathlib import Path | |
| from datetime import datetime | |
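
# Expected output layout (derived from generate_mock_data() below):
#
#   mock_results/1.0.0-dev1/
#       agenteval.json            # suite config: one split per benchmark
#       swe-bench.jsonl           # one record per mock agent
#       multi-swe-bench.jsonl
#       swe-bench-multimodal.jsonl
#       swt-bench.jsonl
#       commit0.jsonl
#       gaia.jsonl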

# Define the 6 benchmarks
BENCHMARKS = {
    "swe-bench": {
        "tags": ["swe-bench"],
        "metric": "resolve_rate",
        "metric_display": "Resolve Rate (%)"
    },
    "multi-swe-bench": {
        "tags": ["multi-swe-bench"],
        "metric": "resolve_rate",
        "metric_display": "Resolve Rate (%)"
    },
    "swe-bench-multimodal": {
        "tags": ["swe-bench-multimodal"],
        "metric": "resolve_rate",
        "metric_display": "Resolve Rate (%)"
    },
    "swt-bench": {
        "tags": ["swt-bench"],
        "metric": "success_rate",
        "metric_display": "Success Rate (%)"
    },
    "commit0": {
        "tags": ["commit0"],
        "metric": "test_pass_rate",
        "metric_display": "Test Pass Rate (%)"
    },
    "gaia": {
        "tags": ["gaia"],
        "metric": "accuracy",
        "metric_display": "Accuracy (%)"
    }
}
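
# Each BENCHMARKS entry becomes one split in the generated agenteval.json,
# e.g. (illustrative):
#   {"name": "swe-bench", "tasks": [{"name": "swe-bench", "tags": ["swe-bench"]}]}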

# Mock agents with realistic scores
MOCK_AGENTS = [
    {
        "agent_name": "1.0.2",
        "llm_base": "claude-3-5-sonnet-20241022",
        "openness": "closed_api_available",
        "tool_usage": "standard",
        "scores": {
            "swe-bench": 48.3,
            "multi-swe-bench": 35.2,
            "swe-bench-multimodal": 42.1,
            "swt-bench": 65.4,
            "commit0": 71.2,
            "gaia": 58.7
        }
    },
    {
        "agent_name": "1.0.1",
        "llm_base": "gpt-4o-2024-11-20",
        "openness": "closed_api_available",
        "tool_usage": "standard",
        "scores": {
            "swe-bench": 45.1,
            "multi-swe-bench": 32.8,
            "swe-bench-multimodal": 39.5,
            "swt-bench": 62.3,
            "commit0": 68.9,
            "gaia": 55.2
        }
    },
    {
        "agent_name": "1.0.0",
        "llm_base": "gpt-4-turbo-2024-04-09",
        "openness": "closed_api_available",
        "tool_usage": "standard",
        "scores": {
            "swe-bench": 38.7,
            "multi-swe-bench": 28.4,
            "swe-bench-multimodal": 34.2,
            "swt-bench": 54.1,
            "commit0": 61.5,
            "gaia": 48.3
        }
    },
    {
        "agent_name": "0.9.5",
        "llm_base": "gpt-4o-mini-2024-07-18",
        "openness": "closed_api_available",
        "tool_usage": "standard",
        "scores": {
            "swe-bench": 32.5,
            "multi-swe-bench": 24.1,
            "swe-bench-multimodal": 28.9,
            "swt-bench": 47.8,
            "commit0": 55.3,
            "gaia": 42.1
        }
    },
    {
        "agent_name": "0.9.0",
        "llm_base": "claude-3-opus-20240229",
        "openness": "closed_api_available",
        "tool_usage": "custom_interface",
        "scores": {
            "swe-bench": 29.8,
            "multi-swe-bench": 21.5,
            "swe-bench-multimodal": 25.7,
            "swt-bench": 44.2,
            "commit0": 52.1,
            "gaia": 39.4
        }
    },
]
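
# Note: generate_mock_data() indexes each agent's "scores" dict directly by
# benchmark name, so every agent must provide a score for every BENCHMARKS key
# or the script raises a KeyError.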


def generate_mock_data():
    """Generate mock JSONL files for all benchmarks."""
    output_dir = Path("mock_results/1.0.0-dev1")
    output_dir.mkdir(parents=True, exist_ok=True)

    # Create agenteval.json config
    config = {
        "suite_config": {
            "name": "openhands-index",
            "version": "1.0.0-dev1",
            "splits": []
        }
    }

    # Generate data for each benchmark
    for benchmark_name, benchmark_info in BENCHMARKS.items():
        print(f"Generating mock data for {benchmark_name}...")

        # Add to config
        config["suite_config"]["splits"].append({
            "name": benchmark_name,
            "tasks": [{
                "name": benchmark_name,
                "tags": benchmark_info["tags"]
            }]
        })

        # Generate JSONL file
        jsonl_path = output_dir / f"{benchmark_name}.jsonl"
        with open(jsonl_path, 'w') as f:
            for agent in MOCK_AGENTS:
                record = {
                    "agent_name": agent["agent_name"],
                    "llm_base": agent["llm_base"],
                    "openness": agent["openness"],
                    "tool_usage": agent["tool_usage"],
                    "score": agent["scores"][benchmark_name],
                    "metric": benchmark_info["metric"],
                    "submission_time": datetime.now().isoformat(),
                    "tags": benchmark_info["tags"],
                    # Additional metadata
                    "total_cost": round(10.0 + agent["scores"][benchmark_name] * 0.5, 2),
                    "total_runtime": round(300 + agent["scores"][benchmark_name] * 5, 1),
                }
                f.write(json.dumps(record) + '\n')

        print(f"  Created {jsonl_path}")

    # Write config file
    config_path = output_dir / "agenteval.json"
    with open(config_path, 'w') as f:
        json.dump(config, f, indent=2)

    print(f"\nCreated config: {config_path}")
    print("\n✓ Mock data generation complete!")
    print(f"  Location: {output_dir}")
    print(f"  Benchmarks: {', '.join(BENCHMARKS.keys())}")
    print(f"  Agents: {len(MOCK_AGENTS)}")


if __name__ == "__main__":
    generate_mock_data()