Spaces:
Running
Running
| """ | |
| Setup script to fetch data from GitHub repository or use mock data as fallback. | |
| This runs before the app starts to ensure data is available. | |
| """ | |
| import os | |
| import shutil | |
| import subprocess | |
| from pathlib import Path | |
| from config import DATA_DIR, EXTRACTED_DATA_DIR, CONFIG_NAME | |
# Remote git repository that fetch_data_from_github() shallow-clones to obtain
# the result data files.
GITHUB_REPO = "https://github.com/OpenHands/openhands-index-results.git"
def _version_sort_key(path):
    """Return a sort key that orders version-like directory names numerically.

    Each dot-separated component that is a decimal number is compared as an
    integer (so "1.10.0" sorts after "1.9.0"); any other component sorts
    below every number. The raw name is a tie-breaker so the order is total.
    """
    numeric = tuple(
        int(part) if part.isdecimal() else -1
        for part in path.name.split(".")
    )
    return numeric, path.name


def fetch_data_from_github():
    """
    Fetch data from the openhands-index-results GitHub repository.

    Shallow-clones GITHUB_REPO into a freshly created temporary directory,
    locates the directory named CONFIG_NAME inside the clone (falling back to
    the highest version-like ``*.*.*`` directory when it is missing), and
    copies that directory to EXTRACTED_DATA_DIR / CONFIG_NAME, replacing any
    previous contents.

    Returns True if successful, False otherwise. Failures (non-zero git exit
    status, timeout, missing data, copy errors) are printed and reported as
    False rather than raised.
    """
    # Function-scope import: only this function needs tempfile.
    import tempfile

    target_dir = Path(EXTRACTED_DATA_DIR) / CONFIG_NAME
    temp_root = None

    try:
        # A uniquely named private directory avoids clashing with concurrent
        # runs and does not rely on a fixed, predictable path under /tmp.
        temp_root = Path(tempfile.mkdtemp(prefix="openhands-index-results-"))
        temp_clone_dir = temp_root / "clone"

        print(f"Attempting to clone data from {GITHUB_REPO}...")

        # Set environment to prevent git from prompting for credentials
        env = os.environ.copy()
        env["GIT_TERMINAL_PROMPT"] = "0"

        # Clone the repository (shallow: only the latest commit is needed)
        result = subprocess.run(
            ["git", "clone", "--depth", "1", GITHUB_REPO, str(temp_clone_dir)],
            capture_output=True,
            text=True,
            timeout=30,  # Shorter timeout
            stdin=subprocess.DEVNULL,  # Prevent any input prompts
            env=env,  # Use modified environment
        )
        if result.returncode != 0:
            print(f"Git clone failed: {result.stderr}")
            return False

        # Look for data files in the cloned repository
        # Expected structure: openhands-index-results/{version}/test.jsonl, validation.jsonl, etc.
        data_source = temp_clone_dir / CONFIG_NAME
        if not data_source.is_dir():
            print(f"Data directory {data_source} not found in repository")
            # Try to find any version directories. Plain files whose names
            # merely contain two dots are skipped, and the result is sorted
            # so the choice does not depend on filesystem listing order.
            version_dirs = sorted(
                (d for d in temp_clone_dir.glob("*.*.*") if d.is_dir()),
                key=_version_sort_key,
            )
            if not version_dirs:
                print("No data found in repository")
                return False
            print(f"Found version directories: {[d.name for d in version_dirs]}")
            # Use the highest available version
            data_source = version_dirs[-1]
            print(f"Using data from {data_source.name}")

        # Check if there are any JSONL files
        if not any(data_source.glob("*.jsonl")):
            print(f"No JSONL files found in {data_source}")
            return False

        # Create target directory and copy data
        os.makedirs(target_dir.parent, exist_ok=True)
        if target_dir.exists():
            shutil.rmtree(target_dir)
        shutil.copytree(data_source, target_dir)
        print(f"Successfully fetched data from GitHub. Files: {list(target_dir.glob('*'))}")
        return True

    except subprocess.TimeoutExpired:
        print("Git clone timed out")
        return False
    except Exception as e:
        print(f"Error fetching data from GitHub: {e}")
        return False
    finally:
        # Best-effort cleanup: a failure to delete the temporary clone must
        # not replace the function's return value with an exception.
        if temp_root is not None:
            shutil.rmtree(temp_root, ignore_errors=True)
def copy_mock_data():
    """Populate the extraction directory from the bundled mock results.

    The source is ``mock_results/CONFIG_NAME`` relative to the current
    working directory; when that does not exist,
    ``/app/mock_results/CONFIG_NAME`` is used instead. An existing
    destination directory is removed before the copy.

    Returns True after a successful copy, False when no mock data directory
    could be found.
    """
    # Prefer the path relative to the working directory; otherwise fall back
    # to the absolute location in case we were started from somewhere else.
    relative_source = Path("mock_results") / CONFIG_NAME
    source_dir = (
        relative_source
        if relative_source.exists()
        else Path("/app/mock_results") / CONFIG_NAME
    )
    destination = Path(EXTRACTED_DATA_DIR) / CONFIG_NAME

    print(f"Current working directory: {os.getcwd()}")
    print(f"Looking for mock data at: {source_dir.absolute()}")

    if source_dir.exists():
        # Make sure the parent of the destination is there before copying.
        os.makedirs(destination.parent, exist_ok=True)

        print(f"Using mock data from {source_dir}")
        if destination.exists():
            shutil.rmtree(destination)
        shutil.copytree(source_dir, destination)

        # Report what ended up in the destination.
        print(f"Mock data copied successfully. Files: {list(destination.glob('*'))}")
        print(f"Target directory: {destination.absolute()}")
        return True

    print(f"ERROR: Mock data directory {source_dir} not found!")
    print(f"Directory contents: {list(Path('.').glob('*'))}")
    return False
def setup_mock_data():
    """
    Make sure leaderboard data is present.

    Returns immediately when JSONL files already exist under
    EXTRACTED_DATA_DIR / CONFIG_NAME. Otherwise the GitHub fetch is tried
    first and the mock-data copy second; if neither succeeds, an error
    banner is printed. Nothing is returned in any case.
    """
    rule = "=" * 60
    print(rule)
    print("STARTING DATA SETUP")
    print(rule)

    data_dir = Path(EXTRACTED_DATA_DIR) / CONFIG_NAME

    # Nothing to do when data is already in place.
    present = list(data_dir.glob("*.jsonl")) if data_dir.exists() else []
    if present:
        print(f"Data already exists at {data_dir}")
        print(f"Found {len(present)} JSONL files: {[f.name for f in present]}")
        return

    # Data sources in priority order:
    # (heading, loader, success message, failure-message prefix).
    sources = (
        (
            "\n--- Attempting to fetch from GitHub ---",
            fetch_data_from_github,
            "✓ Successfully using data from GitHub repository",
            "GitHub fetch failed with error: ",
        ),
        (
            "\n--- GitHub data not available, falling back to mock data ---",
            copy_mock_data,
            "✓ Successfully using mock data",
            "Mock data copy failed with error: ",
        ),
    )

    for heading, load, success_message, failure_prefix in sources:
        print(heading)
        try:
            if load():
                print(success_message)
                return
        except Exception as exc:
            # Report the problem and move on to the next source, if any.
            print(f"{failure_prefix}{exc}")

    alarm = "!" * 60
    print("\n" + alarm)
    print("ERROR: No data available! Neither GitHub nor mock data could be loaded.")
    print(alarm)
# Run the data setup only when this file is executed directly as a script,
# not when it is imported.
if __name__ == "__main__":
    setup_mock_data()