Spaces:

OpenHands
/

openhands-index

Running

openhands-index / setup_data.py

openhands

Add debug logging to track data loading on HuggingFace Space

044cdf4 19 days ago

5.99 kB

	"""
	Setup script to fetch data from GitHub repository or use mock data as fallback.
	This runs before the app starts to ensure data is available.
	"""
	import os
	import shutil
	import subprocess
	from pathlib import Path
	from config import DATA_DIR, EXTRACTED_DATA_DIR, CONFIG_NAME

	# Git remote that fetch_data_from_github() shallow-clones to obtain the
	# "results" directory.
	GITHUB_REPO = "https://github.com/OpenHands/openhands-index-results.git"

	def _log_sample_scores(results_dir):
	    """Print a preview of one ``*/scores.json`` file for debugging.

	    This is diagnostic output only: a missing, unreadable or unexpectedly
	    shaped file is reported and otherwise ignored, so it can never turn an
	    already-completed fetch into a failure.
	    """
	    import json

	    sample_file = next(iter(results_dir.glob("*/scores.json")), None)
	    if sample_file is None:
	        return

	    try:
	        with open(sample_file, encoding="utf-8") as f:
	            sample_data = json.load(f)
	    except (OSError, ValueError) as e:
	        # json.JSONDecodeError and UnicodeDecodeError are both ValueErrors.
	        print(f"Could not read sample data from {sample_file}: {e}")
	        return

	    if not sample_data:
	        preview = 'EMPTY'
	    elif isinstance(sample_data, list):
	        preview = sample_data[0]
	    else:
	        # A JSON object (dict) cannot be indexed with [0]; show it as is.
	        preview = sample_data
	    print(f"Sample data from {sample_file.parent.name}: {preview}")


	def fetch_data_from_github():
	    """
	    Fetch data from the openhands-index-results GitHub repository.

	    Shallow-clones GITHUB_REPO into a private temporary directory and copies
	    its top-level ``results`` directory to
	    ``EXTRACTED_DATA_DIR/CONFIG_NAME/results``, replacing any existing
	    ``EXTRACTED_DATA_DIR/CONFIG_NAME`` tree.

	    Returns True if successful, False otherwise. Clone failures, timeouts,
	    a missing or empty ``results`` directory and any other exception are
	    printed and reported as False rather than raised.
	    """
	    import tempfile

	    target_dir = Path(EXTRACTED_DATA_DIR) / CONFIG_NAME
	    temp_root = None

	    try:
	        # A fresh mkdtemp directory avoids the fixed, predictable /tmp path:
	        # concurrent runs cannot clobber each other and no stale clone from
	        # an earlier run needs to be removed first.
	        temp_root = Path(tempfile.mkdtemp(prefix="openhands-index-results-"))
	        temp_clone_dir = temp_root / "clone"

	        print(f"Attempting to clone data from {GITHUB_REPO}...")

	        # Set environment to prevent git from prompting for credentials
	        env = os.environ.copy()
	        env['GIT_TERMINAL_PROMPT'] = '0'

	        # Clone the repository (history is not needed, so depth 1)
	        result = subprocess.run(
	            ["git", "clone", "--depth", "1", GITHUB_REPO, str(temp_clone_dir)],
	            capture_output=True,
	            text=True,
	            timeout=30,  # Shorter timeout
	            stdin=subprocess.DEVNULL,  # Prevent any input prompts
	            env=env,  # Use modified environment
	        )

	        if result.returncode != 0:
	            print(f"Git clone failed: {result.stderr}")
	            return False

	        # Look for data files in the cloned repository
	        # Expected structure: openhands-index-results/results/YYYYMMDD_model/
	        results_source = temp_clone_dir / "results"

	        if not results_source.is_dir():
	            print("Results directory not found in repository")
	            return False

	        # Only sub-directories count as agent results; stray files such as
	        # a README must not make an otherwise empty checkout look populated.
	        result_dirs = [p for p in results_source.iterdir() if p.is_dir()]
	        if not result_dirs:
	            print(f"No agent results found in {results_source}")
	            return False

	        print(f"Found {len(result_dirs)} agent result directories")

	        # Create target directory and copy the results structure
	        os.makedirs(target_dir.parent, exist_ok=True)
	        if target_dir.exists():
	            shutil.rmtree(target_dir)

	        # Copy the entire results directory
	        target_results = target_dir / "results"
	        shutil.copytree(results_source, target_results)

	        print(f"Successfully fetched data from GitHub. Files: {list(target_dir.glob('*'))}")

	        # Debug preview of one agent's scores; best-effort by design.
	        _log_sample_scores(target_results)

	        return True

	    except subprocess.TimeoutExpired:
	        print("Git clone timed out")
	        return False
	    except Exception as e:
	        print(f"Error fetching data from GitHub: {e}")
	        return False
	    finally:
	        # Best-effort cleanup: a failure to delete the temporary clone must
	        # not replace the function's return value with an exception.
	        if temp_root is not None:
	            shutil.rmtree(temp_root, ignore_errors=True)

	def copy_mock_data():
	    """Populate the extraction directory from the mock results.

	    Looks for ``mock_results/CONFIG_NAME`` relative to the current working
	    directory first and under ``/app`` second, then replaces
	    ``EXTRACTED_DATA_DIR/CONFIG_NAME`` with a copy of whichever was found.

	    Returns True once the copy has been made, False when neither location
	    exists.
	    """
	    # Prefer the working-directory copy; the /app path covers runs that
	    # were started from a different working directory.
	    local_candidate = Path("mock_results") / CONFIG_NAME
	    if local_candidate.exists():
	        source = local_candidate
	    else:
	        source = Path("/app/mock_results") / CONFIG_NAME

	    destination = Path(EXTRACTED_DATA_DIR) / CONFIG_NAME

	    print(f"Current working directory: {os.getcwd()}")
	    print(f"Looking for mock data at: {source.absolute()}")

	    if source.exists():
	        # Make sure the parent of the destination is present.
	        destination.parent.mkdir(parents=True, exist_ok=True)

	        print(f"Using mock data from {source}")
	        # copytree needs a destination that does not exist yet.
	        if destination.exists():
	            shutil.rmtree(destination)
	        shutil.copytree(source, destination)

	        # Report what ended up in the destination.
	        print(f"Mock data copied successfully. Files: {list(destination.glob('*'))}")
	        print(f"Target directory: {destination.absolute()}")
	        return True

	    print(f"ERROR: Mock data directory {source} not found!")
	    print(f"Directory contents: {list(Path('.').glob('*'))}")
	    return False

	def setup_mock_data():
	    """
	    Make leaderboard data available under EXTRACTED_DATA_DIR/CONFIG_NAME.

	    Order of preference: data already on disk (any top-level ``*.jsonl``
	    file), then a fetch from GitHub, then the mock data. Exceptions raised
	    by either loader are caught and printed; if both loaders fail, an error
	    banner is printed and the function returns normally. Always returns
	    None.
	    """
	    rule = "=" * 60
	    print(rule)
	    print("STARTING DATA SETUP")
	    print(rule)

	    data_dir = Path(EXTRACTED_DATA_DIR) / CONFIG_NAME

	    # Nothing to do when data is already present.
	    # NOTE(review): fetch_data_from_github() writes under <data_dir>/results,
	    # so this top-level *.jsonl check looks like it only recognises mock
	    # data - confirm whether fetched data should short-circuit here too.
	    existing = list(data_dir.glob("*.jsonl")) if data_dir.exists() else []
	    if existing:
	        print(f"Data already exists at {data_dir}")
	        print(f"Found {len(existing)} JSONL files: {[f.name for f in existing]}")
	        return

	    # First choice: real results from GitHub.
	    print("\n--- Attempting to fetch from GitHub ---")
	    try:
	        fetched = fetch_data_from_github()
	        if fetched:
	            print("✓ Successfully using data from GitHub repository")
	            return
	    except Exception as exc:
	        print(f"GitHub fetch failed with error: {exc}")

	    # Second choice: the mock data.
	    print("\n--- GitHub data not available, falling back to mock data ---")
	    try:
	        copied = copy_mock_data()
	        if copied:
	            print("✓ Successfully using mock data")
	            return
	    except Exception as exc:
	        print(f"Mock data copy failed with error: {exc}")

	    # Both sources failed: report loudly but do not raise.
	    alarm = "!" * 60
	    print("\n" + alarm)
	    print("ERROR: No data available! Neither GitHub nor mock data could be loaded.")
	    print(alarm)

	# Run the data setup only when this file is executed as a script; importing
	# the module has no side effects beyond the definitions above.
	if __name__ == "__main__":
	    setup_mock_data()