"""Generate mock results data in JSONL format for OpenHands Index."""
import json
import os
from pathlib import Path
from datetime import datetime
# Define the 6 benchmarks
BENCHMARKS = {
    "swe-bench": {
        "tags": ["swe-bench"],
        "metric": "resolve_rate",
        "metric_display": "Resolve Rate (%)"
    },
    "multi-swe-bench": {
        "tags": ["multi-swe-bench"],
        "metric": "resolve_rate",
        "metric_display": "Resolve Rate (%)"
    },
    "swe-bench-multimodal": {
        "tags": ["swe-bench-multimodal"],
        "metric": "resolve_rate",
        "metric_display": "Resolve Rate (%)"
    },
    "swt-bench": {
        "tags": ["swt-bench"],
        "metric": "success_rate",
        "metric_display": "Success Rate (%)"
    },
    "commit0": {
        "tags": ["commit0"],
        "metric": "test_pass_rate",
        "metric_display": "Test Pass Rate (%)"
    },
    "gaia": {
        "tags": ["gaia"],
        "metric": "accuracy",
        "metric_display": "Accuracy (%)"
    }
}
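
# Each entry above names a leaderboard split: the tags that match result rows
# to it, and the metric key that scores it. The tags are also copied into each
# split's task list in agenteval.json below; metric_display is unused in this
# script and is presumably rendered by the leaderboard UI.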

# Mock agents with realistic scores
MOCK_AGENTS = [
    {
        "agent_name": "1.0.2",
        "llm_base": "claude-3-5-sonnet-20241022",
        "openness": "closed",
        "scores": {
            "swe-bench": 48.3,
            "multi-swe-bench": 35.2,
            "swe-bench-multimodal": 42.1,
            "swt-bench": 65.4,
            "commit0": 71.2,
            "gaia": 58.7
        }
    },
    {
        "agent_name": "1.0.1",
        "llm_base": "gpt-4o-2024-11-20",
        "openness": "closed",
        "scores": {
            "swe-bench": 45.1,
            "multi-swe-bench": 32.8,
            "swe-bench-multimodal": 39.5,
            "swt-bench": 62.3,
            "commit0": 68.9,
            "gaia": 55.2
        }
    },
    {
        "agent_name": "1.0.0",
        "llm_base": "gpt-4-turbo-2024-04-09",
        "openness": "closed",
        "scores": {
            "swe-bench": 38.7,
            "multi-swe-bench": 28.4,
            "swe-bench-multimodal": 34.2,
            "swt-bench": 54.1,
            "commit0": 61.5,
            "gaia": 48.3
        }
    },
    {
        "agent_name": "0.9.5",
        "llm_base": "gpt-4o-mini-2024-07-18",
        "openness": "closed",
        "scores": {
            "swe-bench": 32.5,
            "multi-swe-bench": 24.1,
            "swe-bench-multimodal": 28.9,
            "swt-bench": 47.8,
            "commit0": 55.3,
            "gaia": 42.1
        }
    },
    {
        "agent_name": "0.9.0",
        "llm_base": "claude-3-opus-20240229",
        "openness": "closed",
        "scores": {
            "swe-bench": 29.8,
            "multi-swe-bench": 21.5,
            "swe-bench-multimodal": 25.7,
            "swt-bench": 44.2,
            "commit0": 52.1,
            "gaia": 39.4
        }
    },
]
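
# Note: agent_name carries an OpenHands release version rather than a model
# name, and "openness" feeds the leaderboard's open vs. closed model filter;
# all mock entries here use closed models.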


def generate_mock_data():
    """Generate mock JSONL files for all benchmarks."""
    output_dir = Path("mock_results/1.0.0-dev1")
    output_dir.mkdir(parents=True, exist_ok=True)

    # Create agenteval.json config
    config = {
        "suite_config": {
            "name": "openhands-index",
            "version": "1.0.0-dev1",
            "splits": []
        }
    }
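    # Sketch of the final config, with one split appended per benchmark below:
    # {"suite_config": {"name": "openhands-index", "version": "1.0.0-dev1",
    #                   "splits": [{"name": "swe-bench",
    #                               "tasks": [{"name": "swe-bench",
    #                                          "tags": ["swe-bench"]}]}, ...]}}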

    # Generate data for each benchmark
    for benchmark_name, benchmark_info in BENCHMARKS.items():
        print(f"Generating mock data for {benchmark_name}...")

        # Add to config
        config["suite_config"]["splits"].append({
            "name": benchmark_name,
            "tasks": [{
                "name": benchmark_name,
                "tags": benchmark_info["tags"]
            }]
        })

        # Generate JSONL file
        jsonl_path = output_dir / f"{benchmark_name}.jsonl"
        with open(jsonl_path, 'w') as f:
            for agent in MOCK_AGENTS:
                record = {
                    "agent_name": agent["agent_name"],
                    "llm_base": agent["llm_base"],
                    "openness": agent["openness"],
                    "score": agent["scores"][benchmark_name],
                    "metric": benchmark_info["metric"],
                    "submission_time": datetime.now().isoformat(),
                    "tags": benchmark_info["tags"],
                    # Additional metadata
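                    # (cost and runtime are synthesized linearly from the
                    # score, so stronger runs look more expensive)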
"total_cost": round(10.0 + agent["scores"][benchmark_name] * 0.5, 2),
"total_runtime": round(300 + agent["scores"][benchmark_name] * 5, 1),
}
f.write(json.dumps(record) + '\n')
print(f" Created {jsonl_path}")
# Write config file
config_path = output_dir / "agenteval.json"
with open(config_path, 'w') as f:
json.dump(config, f, indent=2)
print(f"\nCreated config: {config_path}")
print("\n✓ Mock data generation complete!")
print(f" Location: {output_dir}")
print(f" Benchmarks: {', '.join(BENCHMARKS.keys())}")
print(f" Agents: {len(MOCK_AGENTS)}")

if __name__ == "__main__":
    generate_mock_data()