"""Generate mock results data in JSONL format for OpenHands Index."""
import json
import os
from pathlib import Path
from datetime import datetime
# Define the 6 benchmarks
BENCHMARKS = {
    "swe-bench": {
        "tags": ["swe-bench"],
        "metric": "resolve_rate",
        "metric_display": "Resolve Rate (%)"
    },
    "multi-swe-bench": {
        "tags": ["multi-swe-bench"],
        "metric": "resolve_rate",
        "metric_display": "Resolve Rate (%)"
    },
    "swe-bench-multimodal": {
        "tags": ["swe-bench-multimodal"],
        "metric": "resolve_rate",
        "metric_display": "Resolve Rate (%)"
    },
    "swt-bench": {
        "tags": ["swt-bench"],
        "metric": "success_rate",
        "metric_display": "Success Rate (%)"
    },
    "commit0": {
        "tags": ["commit0"],
        "metric": "test_pass_rate",
        "metric_display": "Test Pass Rate (%)"
    },
    "gaia": {
        "tags": ["gaia"],
        "metric": "accuracy",
        "metric_display": "Accuracy (%)"
    }
}
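
# Each entry above names a leaderboard split: the tags that match result rows
# to it, and the metric key that scores it. The tags are also copied into each
# split's task list in agenteval.json below; metric_display is unused in this
# script and is presumably rendered by the leaderboard UI.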

# Mock agents with realistic scores
MOCK_AGENTS = [
    {
        "agent_name": "1.0.2",
        "llm_base": "claude-3-5-sonnet-20241022",
        "openness": "closed",
        "scores": {
            "swe-bench": 48.3,
            "multi-swe-bench": 35.2,
            "swe-bench-multimodal": 42.1,
            "swt-bench": 65.4,
            "commit0": 71.2,
            "gaia": 58.7
        }
    },
    {
        "agent_name": "1.0.1",
        "llm_base": "gpt-4o-2024-11-20",
        "openness": "closed",
        "scores": {
            "swe-bench": 45.1,
            "multi-swe-bench": 32.8,
            "swe-bench-multimodal": 39.5,
            "swt-bench": 62.3,
            "commit0": 68.9,
            "gaia": 55.2
        }
    },
    {
        "agent_name": "1.0.0",
        "llm_base": "gpt-4-turbo-2024-04-09",
        "openness": "closed",
        "scores": {
            "swe-bench": 38.7,
            "multi-swe-bench": 28.4,
            "swe-bench-multimodal": 34.2,
            "swt-bench": 54.1,
            "commit0": 61.5,
            "gaia": 48.3
        }
    },
    {
        "agent_name": "0.9.5",
        "llm_base": "gpt-4o-mini-2024-07-18",
        "openness": "closed",
        "scores": {
            "swe-bench": 32.5,
            "multi-swe-bench": 24.1,
            "swe-bench-multimodal": 28.9,
            "swt-bench": 47.8,
            "commit0": 55.3,
            "gaia": 42.1
        }
    },
    {
        "agent_name": "0.9.0",
        "llm_base": "claude-3-opus-20240229",
        "openness": "closed",
        "scores": {
            "swe-bench": 29.8,
            "multi-swe-bench": 21.5,
            "swe-bench-multimodal": 25.7,
            "swt-bench": 44.2,
            "commit0": 52.1,
            "gaia": 39.4
        }
    },
]
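
# Note: agent_name carries an OpenHands release version rather than a model
# name, and "openness" feeds the leaderboard's open vs. closed model filter;
# all mock entries here use closed models.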


def generate_mock_data():
    """Generate mock JSONL files for all benchmarks."""
    output_dir = Path("mock_results/1.0.0-dev1")
    output_dir.mkdir(parents=True, exist_ok=True)

    # Create agenteval.json config
    config = {
        "suite_config": {
            "name": "openhands-index",
            "version": "1.0.0-dev1",
            "splits": []
        }
    }
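    # Sketch of the final config, with one split appended per benchmark below:
    # {"suite_config": {"name": "openhands-index", "version": "1.0.0-dev1",
    #                   "splits": [{"name": "swe-bench",
    #                               "tasks": [{"name": "swe-bench",
    #                                          "tags": ["swe-bench"]}]}, ...]}}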

    # Generate data for each benchmark
    for benchmark_name, benchmark_info in BENCHMARKS.items():
        print(f"Generating mock data for {benchmark_name}...")

        # Add to config
        config["suite_config"]["splits"].append({
            "name": benchmark_name,
            "tasks": [{
                "name": benchmark_name,
                "tags": benchmark_info["tags"]
            }]
        })

        # Generate JSONL file
        jsonl_path = output_dir / f"{benchmark_name}.jsonl"
        with open(jsonl_path, 'w') as f:
            for agent in MOCK_AGENTS:
                record = {
                    "agent_name": agent["agent_name"],
                    "llm_base": agent["llm_base"],
                    "openness": agent["openness"],
                    "score": agent["scores"][benchmark_name],
                    "metric": benchmark_info["metric"],
                    "submission_time": datetime.now().isoformat(),
                    "tags": benchmark_info["tags"],
                    # Additional metadata
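                    # (cost and runtime are synthesized linearly from the
                    # score, so stronger runs look more expensive)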
"total_cost": round(10.0 + agent["scores"][benchmark_name] * 0.5, 2),
"total_runtime": round(300 + agent["scores"][benchmark_name] * 5, 1),
}
f.write(json.dumps(record) + '\n')
print(f" Created {jsonl_path}")
# Write config file
config_path = output_dir / "agenteval.json"
with open(config_path, 'w') as f:
json.dump(config, f, indent=2)
print(f"\nCreated config: {config_path}")
print("\n✓ Mock data generation complete!")
print(f" Location: {output_dir}")
print(f" Benchmarks: {', '.join(BENCHMARKS.keys())}")
print(f" Agents: {len(MOCK_AGENTS)}")

if __name__ == "__main__":
    generate_mock_data()