openhands-index / generate_mock_jsonl.py
"""Generate mock results data in JSONL format for OpenHands Index."""
import json
import os
from pathlib import Path
from datetime import datetime
# Define the 6 benchmarks
BENCHMARKS = {
    "swe-bench": {
        "tags": ["swe-bench"],
        "metric": "resolve_rate",
        "metric_display": "Resolve Rate (%)"
    },
    "multi-swe-bench": {
        "tags": ["multi-swe-bench"],
        "metric": "resolve_rate",
        "metric_display": "Resolve Rate (%)"
    },
    "swe-bench-multimodal": {
        "tags": ["swe-bench-multimodal"],
        "metric": "resolve_rate",
        "metric_display": "Resolve Rate (%)"
    },
    "swt-bench": {
        "tags": ["swt-bench"],
        "metric": "success_rate",
        "metric_display": "Success Rate (%)"
    },
    "commit0": {
        "tags": ["commit0"],
        "metric": "test_pass_rate",
        "metric_display": "Test Pass Rate (%)"
    },
    "gaia": {
        "tags": ["gaia"],
        "metric": "accuracy",
        "metric_display": "Accuracy (%)"
    }
}
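
# NOTE: for each benchmark, "metric" and "tags" are copied verbatim into every
# JSONL record, while "metric_display" is a human-readable label that is not
# written to the JSONL files (see generate_mock_data below).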

# Mock agents with realistic scores
MOCK_AGENTS = [
    {
        "agent_name": "1.0.2",
        "llm_base": "claude-3-5-sonnet-20241022",
        "openness": "closed_api_available",
        "tool_usage": "standard",
        "scores": {
            "swe-bench": 48.3,
            "multi-swe-bench": 35.2,
            "swe-bench-multimodal": 42.1,
            "swt-bench": 65.4,
            "commit0": 71.2,
            "gaia": 58.7
        }
    },
    {
        "agent_name": "1.0.1",
        "llm_base": "gpt-4o-2024-11-20",
        "openness": "closed_api_available",
        "tool_usage": "standard",
        "scores": {
            "swe-bench": 45.1,
            "multi-swe-bench": 32.8,
            "swe-bench-multimodal": 39.5,
            "swt-bench": 62.3,
            "commit0": 68.9,
            "gaia": 55.2
        }
    },
    {
        "agent_name": "1.0.0",
        "llm_base": "gpt-4-turbo-2024-04-09",
        "openness": "closed_api_available",
        "tool_usage": "standard",
        "scores": {
            "swe-bench": 38.7,
            "multi-swe-bench": 28.4,
            "swe-bench-multimodal": 34.2,
            "swt-bench": 54.1,
            "commit0": 61.5,
            "gaia": 48.3
        }
    },
    {
        "agent_name": "0.9.5",
        "llm_base": "gpt-4o-mini-2024-07-18",
        "openness": "closed_api_available",
        "tool_usage": "standard",
        "scores": {
            "swe-bench": 32.5,
            "multi-swe-bench": 24.1,
            "swe-bench-multimodal": 28.9,
            "swt-bench": 47.8,
            "commit0": 55.3,
            "gaia": 42.1
        }
    },
    {
        "agent_name": "0.9.0",
        "llm_base": "claude-3-opus-20240229",
        "openness": "closed_api_available",
        "tool_usage": "custom_interface",
        "scores": {
            "swe-bench": 29.8,
            "multi-swe-bench": 21.5,
            "swe-bench-multimodal": 25.7,
            "swt-bench": 44.2,
            "commit0": 52.1,
            "gaia": 39.4
        }
    },
]


def generate_mock_data():
    """Generate mock JSONL files for all benchmarks."""
    output_dir = Path("mock_results/1.0.0-dev1")
    output_dir.mkdir(parents=True, exist_ok=True)

    # Create agenteval.json config
    config = {
        "suite_config": {
            "name": "openhands-index",
            "version": "1.0.0-dev1",
            "splits": []
        }
    }

    # Generate data for each benchmark
    for benchmark_name, benchmark_info in BENCHMARKS.items():
        print(f"Generating mock data for {benchmark_name}...")

        # Add to config
        config["suite_config"]["splits"].append({
            "name": benchmark_name,
            "tasks": [{
                "name": benchmark_name,
                "tags": benchmark_info["tags"]
            }]
        })

        # Generate JSONL file
        jsonl_path = output_dir / f"{benchmark_name}.jsonl"
        with open(jsonl_path, 'w') as f:
            for agent in MOCK_AGENTS:
                record = {
                    "agent_name": agent["agent_name"],
                    "llm_base": agent["llm_base"],
                    "openness": agent["openness"],
                    "tool_usage": agent["tool_usage"],
                    "score": agent["scores"][benchmark_name],
                    "metric": benchmark_info["metric"],
                    "submission_time": datetime.now().isoformat(),
                    "tags": benchmark_info["tags"],
                    # Additional metadata
                    "total_cost": round(10.0 + agent["scores"][benchmark_name] * 0.5, 2),
                    "total_runtime": round(300 + agent["scores"][benchmark_name] * 5, 1),
                }
                f.write(json.dumps(record) + '\n')
        print(f" Created {jsonl_path}")

    # Write config file
    config_path = output_dir / "agenteval.json"
    with open(config_path, 'w') as f:
        json.dump(config, f, indent=2)
    print(f"\nCreated config: {config_path}")

    print("\n✓ Mock data generation complete!")
    print(f" Location: {output_dir}")
    print(f" Benchmarks: {', '.join(BENCHMARKS.keys())}")
    print(f" Agents: {len(MOCK_AGENTS)}")
if __name__ == "__main__":
generate_mock_data()
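
# Usage (a sketch; assumes the script is run from the directory where the
# mock_results/ folder should be created, since output_dir is a relative path):
#
#   python generate_mock_jsonl.py
#
# This writes mock_results/1.0.0-dev1/agenteval.json plus one JSONL file per
# benchmark (e.g. mock_results/1.0.0-dev1/swe-bench.jsonl). Each line of a
# JSONL file is a single JSON record, roughly of the form:
#
#   {"agent_name": "1.0.2", "llm_base": "claude-3-5-sonnet-20241022",
#    "openness": "closed_api_available", "tool_usage": "standard",
#    "score": 48.3, "metric": "resolve_rate", "submission_time": "...",
#    "tags": ["swe-bench"], "total_cost": 34.15, "total_runtime": 541.5}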