"""Generate mock results data in JSONL format for OpenHands Index.""" import json import os from pathlib import Path from datetime import datetime # Define the 6 benchmarks BENCHMARKS = { "swe-bench": { "tags": ["swe-bench"], "metric": "resolve_rate", "metric_display": "Resolve Rate (%)" }, "multi-swe-bench": { "tags": ["multi-swe-bench"], "metric": "resolve_rate", "metric_display": "Resolve Rate (%)" }, "swe-bench-multimodal": { "tags": ["swe-bench-multimodal"], "metric": "resolve_rate", "metric_display": "Resolve Rate (%)" }, "swt-bench": { "tags": ["swt-bench"], "metric": "success_rate", "metric_display": "Success Rate (%)" }, "commit0": { "tags": ["commit0"], "metric": "test_pass_rate", "metric_display": "Test Pass Rate (%)" }, "gaia": { "tags": ["gaia"], "metric": "accuracy", "metric_display": "Accuracy (%)" } } # Mock agents with realistic scores MOCK_AGENTS = [ { "agent_name": "1.0.2", "llm_base": "claude-3-5-sonnet-20241022", "openness": "closed", "scores": { "swe-bench": 48.3, "multi-swe-bench": 35.2, "swe-bench-multimodal": 42.1, "swt-bench": 65.4, "commit0": 71.2, "gaia": 58.7 } }, { "agent_name": "1.0.1", "llm_base": "gpt-4o-2024-11-20", "openness": "closed", "scores": { "swe-bench": 45.1, "multi-swe-bench": 32.8, "swe-bench-multimodal": 39.5, "swt-bench": 62.3, "commit0": 68.9, "gaia": 55.2 } }, { "agent_name": "1.0.0", "llm_base": "gpt-4-turbo-2024-04-09", "openness": "closed", "scores": { "swe-bench": 38.7, "multi-swe-bench": 28.4, "swe-bench-multimodal": 34.2, "swt-bench": 54.1, "commit0": 61.5, "gaia": 48.3 } }, { "agent_name": "0.9.5", "llm_base": "gpt-4o-mini-2024-07-18", "openness": "closed", "scores": { "swe-bench": 32.5, "multi-swe-bench": 24.1, "swe-bench-multimodal": 28.9, "swt-bench": 47.8, "commit0": 55.3, "gaia": 42.1 } }, { "agent_name": "0.9.0", "llm_base": "claude-3-opus-20240229", "openness": "closed", "scores": { "swe-bench": 29.8, "multi-swe-bench": 21.5, "swe-bench-multimodal": 25.7, "swt-bench": 44.2, "commit0": 52.1, "gaia": 39.4 } }, ] def generate_mock_data(): """Generate mock JSONL files for all benchmarks.""" output_dir = Path("mock_results/1.0.0-dev1") output_dir.mkdir(parents=True, exist_ok=True) # Create agenteval.json config config = { "suite_config": { "name": "openhands-index", "version": "1.0.0-dev1", "splits": [] } } # Generate data for each benchmark for benchmark_name, benchmark_info in BENCHMARKS.items(): print(f"Generating mock data for {benchmark_name}...") # Add to config config["suite_config"]["splits"].append({ "name": benchmark_name, "tasks": [{ "name": benchmark_name, "tags": benchmark_info["tags"] }] }) # Generate JSONL file jsonl_path = output_dir / f"{benchmark_name}.jsonl" with open(jsonl_path, 'w') as f: for agent in MOCK_AGENTS: record = { "agent_name": agent["agent_name"], "llm_base": agent["llm_base"], "openness": agent["openness"], "score": agent["scores"][benchmark_name], "metric": benchmark_info["metric"], "submission_time": datetime.now().isoformat(), "tags": benchmark_info["tags"], # Additional metadata "total_cost": round(10.0 + agent["scores"][benchmark_name] * 0.5, 2), "total_runtime": round(300 + agent["scores"][benchmark_name] * 5, 1), } f.write(json.dumps(record) + '\n') print(f" Created {jsonl_path}") # Write config file config_path = output_dir / "agenteval.json" with open(config_path, 'w') as f: json.dump(config, f, indent=2) print(f"\nCreated config: {config_path}") print("\n✓ Mock data generation complete!") print(f" Location: {output_dir}") print(f" Benchmarks: {', '.join(BENCHMARKS.keys())}") print(f" Agents: {len(MOCK_AGENTS)}") if __name__ == "__main__": generate_mock_data()