openhands-index / generate_mock_jsonl.py
"""Generate mock results data in JSONL format for OpenHands Index."""
import json
import os
from pathlib import Path
from datetime import datetime
# Define the 6 benchmarks
BENCHMARKS = {
    "swe-bench": {
        "tags": ["swe-bench"],
        "metric": "resolve_rate",
        "metric_display": "Resolve Rate (%)"
    },
    "multi-swe-bench": {
        "tags": ["multi-swe-bench"],
        "metric": "resolve_rate",
        "metric_display": "Resolve Rate (%)"
    },
    "swe-bench-multimodal": {
        "tags": ["swe-bench-multimodal"],
        "metric": "resolve_rate",
        "metric_display": "Resolve Rate (%)"
    },
    "swt-bench": {
        "tags": ["swt-bench"],
        "metric": "success_rate",
        "metric_display": "Success Rate (%)"
    },
    "commit0": {
        "tags": ["commit0"],
        "metric": "test_pass_rate",
        "metric_display": "Test Pass Rate (%)"
    },
    "gaia": {
        "tags": ["gaia"],
        "metric": "accuracy",
        "metric_display": "Accuracy (%)"
    }
}
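
# NOTE: for each benchmark, "metric" and "tags" are copied verbatim into every
# JSONL record, while "metric_display" is a human-readable label that is not
# written to the JSONL files (see generate_mock_data below).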

# Mock agents with realistic scores
MOCK_AGENTS = [
    {
        "agent_name": "1.0.2",
        "llm_base": "claude-3-5-sonnet-20241022",
        "openness": "closed_api_available",
        "tool_usage": "standard",
        "scores": {
            "swe-bench": 48.3,
            "multi-swe-bench": 35.2,
            "swe-bench-multimodal": 42.1,
            "swt-bench": 65.4,
            "commit0": 71.2,
            "gaia": 58.7
        }
    },
    {
        "agent_name": "1.0.1",
        "llm_base": "gpt-4o-2024-11-20",
        "openness": "closed_api_available",
        "tool_usage": "standard",
        "scores": {
            "swe-bench": 45.1,
            "multi-swe-bench": 32.8,
            "swe-bench-multimodal": 39.5,
            "swt-bench": 62.3,
            "commit0": 68.9,
            "gaia": 55.2
        }
    },
    {
        "agent_name": "1.0.0",
        "llm_base": "gpt-4-turbo-2024-04-09",
        "openness": "closed_api_available",
        "tool_usage": "standard",
        "scores": {
            "swe-bench": 38.7,
            "multi-swe-bench": 28.4,
            "swe-bench-multimodal": 34.2,
            "swt-bench": 54.1,
            "commit0": 61.5,
            "gaia": 48.3
        }
    },
    {
        "agent_name": "0.9.5",
        "llm_base": "gpt-4o-mini-2024-07-18",
        "openness": "closed_api_available",
        "tool_usage": "standard",
        "scores": {
            "swe-bench": 32.5,
            "multi-swe-bench": 24.1,
            "swe-bench-multimodal": 28.9,
            "swt-bench": 47.8,
            "commit0": 55.3,
            "gaia": 42.1
        }
    },
    {
        "agent_name": "0.9.0",
        "llm_base": "claude-3-opus-20240229",
        "openness": "closed_api_available",
        "tool_usage": "custom_interface",
        "scores": {
            "swe-bench": 29.8,
            "multi-swe-bench": 21.5,
            "swe-bench-multimodal": 25.7,
            "swt-bench": 44.2,
            "commit0": 52.1,
            "gaia": 39.4
        }
    },
]


def generate_mock_data():
    """Generate mock JSONL files for all benchmarks."""
    output_dir = Path("mock_results/1.0.0-dev1")
    output_dir.mkdir(parents=True, exist_ok=True)

    # Create agenteval.json config
    config = {
        "suite_config": {
            "name": "openhands-index",
            "version": "1.0.0-dev1",
            "splits": []
        }
    }

    # Generate data for each benchmark
    for benchmark_name, benchmark_info in BENCHMARKS.items():
        print(f"Generating mock data for {benchmark_name}...")

        # Add to config
        config["suite_config"]["splits"].append({
            "name": benchmark_name,
            "tasks": [{
                "name": benchmark_name,
                "tags": benchmark_info["tags"]
            }]
        })

        # Generate JSONL file
        jsonl_path = output_dir / f"{benchmark_name}.jsonl"
        with open(jsonl_path, 'w') as f:
            for agent in MOCK_AGENTS:
                record = {
                    "agent_name": agent["agent_name"],
                    "llm_base": agent["llm_base"],
                    "openness": agent["openness"],
                    "tool_usage": agent["tool_usage"],
                    "score": agent["scores"][benchmark_name],
                    "metric": benchmark_info["metric"],
                    "submission_time": datetime.now().isoformat(),
                    "tags": benchmark_info["tags"],
                    # Additional metadata
                    "total_cost": round(10.0 + agent["scores"][benchmark_name] * 0.5, 2),
                    "total_runtime": round(300 + agent["scores"][benchmark_name] * 5, 1),
                }
                f.write(json.dumps(record) + '\n')
        print(f" Created {jsonl_path}")

    # Write config file
    config_path = output_dir / "agenteval.json"
    with open(config_path, 'w') as f:
        json.dump(config, f, indent=2)
    print(f"\nCreated config: {config_path}")

    print("\n✓ Mock data generation complete!")
    print(f" Location: {output_dir}")
    print(f" Benchmarks: {', '.join(BENCHMARKS.keys())}")
    print(f" Agents: {len(MOCK_AGENTS)}")
if __name__ == "__main__":
generate_mock_data()
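
# Usage (a sketch; assumes the script is run from the directory where the
# mock_results/ folder should be created, since output_dir is a relative path):
#
#   python generate_mock_jsonl.py
#
# This writes mock_results/1.0.0-dev1/agenteval.json plus one JSONL file per
# benchmark (e.g. mock_results/1.0.0-dev1/swe-bench.jsonl). Each line of a
# JSONL file is a single JSON record, roughly of the form:
#
#   {"agent_name": "1.0.2", "llm_base": "claude-3-5-sonnet-20241022",
#    "openness": "closed_api_available", "tool_usage": "standard",
#    "score": 48.3, "metric": "resolve_rate", "submission_time": "...",
#    "tags": ["swe-bench"], "total_cost": 34.15, "total_runtime": 541.5}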