File size: 5,858 Bytes
a63b4cf
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
671ebc9
 
 
 
a63b4cf
 
 
 
 
671ebc9
 
 
a63b4cf
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
2176460
a63b4cf
2176460
 
 
 
a63b4cf
 
2176460
 
 
a63b4cf
2176460
 
a63b4cf
 
 
 
 
 
 
 
 
 
2176460
 
 
 
 
a63b4cf
 
9da8453
a63b4cf
9da8453
a63b4cf
 
2176460
 
 
 
a63b4cf
 
 
 
2176460
a63b4cf
2176460
a63b4cf
 
 
2176460
9da8453
 
 
 
 
 
a63b4cf
 
2176460
9da8453
 
 
 
 
 
a63b4cf
2176460
a63b4cf
2176460
a63b4cf
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
"""
Setup script to fetch data from GitHub repository or use mock data as fallback.
This runs before the app starts to ensure data is available.
"""
import os
import shutil
import subprocess
from pathlib import Path
from config import DATA_DIR, EXTRACTED_DATA_DIR, CONFIG_NAME

# Git URL of the results repository that fetch_data_from_github() clones.
GITHUB_REPO = "https://github.com/OpenHands/openhands-index-results.git"

def _version_sort_key(name):
    """Return a sort key that orders dotted version names numerically.

    A leading "v"/"V" is ignored. Numeric components compare as integers
    (so "1.10.0" sorts after "1.9.0"); non-numeric components sort below
    numeric ones so that a plain release outranks e.g. "1.0.0-rc1".
    """
    key = []
    for piece in name.lstrip("vV").split("."):
        if piece.isdecimal():
            key.append((1, int(piece), ""))
        else:
            key.append((0, 0, piece))
    return key


def fetch_data_from_github():
    """
    Fetch data from the openhands-index-results GitHub repository.

    Shallow-clones GITHUB_REPO into a freshly created private temporary
    directory, locates the directory named CONFIG_NAME inside the clone
    (falling back to the highest version-like directory matching "*.*.*"
    when it is missing), verifies it contains at least one ``*.jsonl`` file,
    and copies it to ``EXTRACTED_DATA_DIR / CONFIG_NAME``, replacing any
    previous contents of that directory.

    Failures during the clone or copy (git error, timeout, missing data,
    filesystem errors) are reported with ``print`` and turned into a
    ``False`` return value. The temporary clone is always removed on exit.

    Returns True if successful, False otherwise.
    """
    # Function-scope import so this block does not depend on a file-level
    # import that the module does not currently have.
    import tempfile

    target_dir = Path(EXTRACTED_DATA_DIR) / CONFIG_NAME
    temp_root = None

    try:
        # A unique, private directory avoids collisions between concurrent
        # runs and avoids relying on a predictable path under /tmp.
        temp_root = Path(tempfile.mkdtemp(prefix="openhands-index-results-"))
        temp_clone_dir = temp_root / "clone"

        print(f"Attempting to clone data from {GITHUB_REPO}...")

        # Set environment to prevent git from prompting for credentials
        env = os.environ.copy()
        env['GIT_TERMINAL_PROMPT'] = '0'

        # Clone the repository (shallow: only the latest commit is needed)
        result = subprocess.run(
            ["git", "clone", "--depth", "1", GITHUB_REPO, str(temp_clone_dir)],
            capture_output=True,
            text=True,
            timeout=30,  # Shorter timeout
            stdin=subprocess.DEVNULL,  # Prevent any input prompts
            env=env,  # Use modified environment
        )

        if result.returncode != 0:
            print(f"Git clone failed: {result.stderr}")
            return False

        # Look for data files in the cloned repository
        # Expected structure: openhands-index-results/{version}/test.jsonl, validation.jsonl, etc.
        data_source = temp_clone_dir / CONFIG_NAME

        if not data_source.is_dir():
            print(f"Data directory {data_source} not found in repository")
            # Try to find any version directories. Only directories qualify
            # (the pattern alone would also match files such as "a.tar.gz"),
            # and they are ordered newest-first so the choice does not depend
            # on filesystem enumeration order.
            version_dirs = sorted(
                (d for d in temp_clone_dir.glob("*.*.*") if d.is_dir()),
                key=lambda d: _version_sort_key(d.name),
                reverse=True,
            )
            if not version_dirs:
                print("No data found in repository")
                return False
            print(f"Found version directories: {[d.name for d in version_dirs]}")
            # Use the highest available version
            data_source = version_dirs[0]
            print(f"Using data from {data_source.name}")

        # Check if there are any JSONL files
        if not any(data_source.glob("*.jsonl")):
            print(f"No JSONL files found in {data_source}")
            return False

        # Create target directory and copy data
        os.makedirs(target_dir.parent, exist_ok=True)
        if target_dir.exists():
            shutil.rmtree(target_dir)
        shutil.copytree(data_source, target_dir)

        print(f"Successfully fetched data from GitHub. Files: {list(target_dir.glob('*'))}")
        return True

    except subprocess.TimeoutExpired:
        print("Git clone timed out")
        return False
    except Exception as e:
        # Top-level boundary for this best-effort fetch: callers fall back to
        # other data sources when False is returned.
        print(f"Error fetching data from GitHub: {e}")
        return False
    finally:
        # Best-effort cleanup; a failure to delete the temporary clone must
        # not replace the function's result with an exception.
        if temp_root is not None:
            shutil.rmtree(temp_root, ignore_errors=True)

def copy_mock_data():
    """Populate the extraction directory from the bundled mock results.

    The mock data is looked up at ``mock_results/CONFIG_NAME`` relative to
    the current working directory, and at ``/app/mock_results/CONFIG_NAME``
    if that does not exist. When found, it replaces whatever is currently at
    ``EXTRACTED_DATA_DIR / CONFIG_NAME``.

    Returns True after a successful copy, False when no mock data directory
    could be located.
    """
    # Prefer the working-directory copy; otherwise use the absolute location.
    local_candidate = Path("mock_results") / CONFIG_NAME
    absolute_candidate = Path("/app/mock_results") / CONFIG_NAME
    source = local_candidate if local_candidate.exists() else absolute_candidate

    destination = Path(EXTRACTED_DATA_DIR) / CONFIG_NAME

    print(f"Current working directory: {os.getcwd()}")
    print(f"Looking for mock data at: {source.absolute()}")

    # Guard clause: nothing to copy if neither candidate exists.
    if not source.exists():
        print(f"ERROR: Mock data directory {source} not found!")
        print(f"Directory contents: {list(Path('.').glob('*'))}")
        return False

    # Make sure the parent of the destination is present.
    destination.parent.mkdir(parents=True, exist_ok=True)

    # Replace any previous extraction with a fresh copy of the mock data.
    print(f"Using mock data from {source}")
    if destination.exists():
        shutil.rmtree(destination)
    shutil.copytree(source, destination)

    # Report what ended up in the destination.
    copied_files = list(destination.glob('*'))
    print(f"Mock data copied successfully. Files: {copied_files}")
    print(f"Target directory: {destination.absolute()}")
    return True

def setup_mock_data():
    """
    Make sure leaderboard data is present in the extraction directory.

    If ``EXTRACTED_DATA_DIR / CONFIG_NAME`` already holds at least one
    ``*.jsonl`` file, nothing is done. Otherwise the data is fetched from
    GitHub, and if that does not succeed, the mock data is copied instead.
    When both sources fail an error banner is printed; no exception is
    raised and nothing is returned.
    """
    rule = "=" * 60
    print(rule)
    print("STARTING DATA SETUP")
    print(rule)

    destination = Path(EXTRACTED_DATA_DIR) / CONFIG_NAME

    # Short-circuit when a previous run already left JSONL files in place.
    existing = list(destination.glob("*.jsonl")) if destination.exists() else []
    if existing:
        print(f"Data already exists at {destination}")
        print(f"Found {len(existing)} JSONL files: {[f.name for f in existing]}")
        return

    # First choice: the GitHub repository.
    print("\n--- Attempting to fetch from GitHub ---")
    try:
        github_ok = fetch_data_from_github()
        if github_ok:
            print("✓ Successfully using data from GitHub repository")
            return
    except Exception as err:
        print(f"GitHub fetch failed with error: {err}")

    # Second choice: the mock data.
    print("\n--- GitHub data not available, falling back to mock data ---")
    try:
        mock_ok = copy_mock_data()
        if mock_ok:
            print("✓ Successfully using mock data")
            return
    except Exception as err:
        print(f"Mock data copy failed with error: {err}")

    # Neither source produced data.
    alarm = "!" * 60
    print("\n" + alarm)
    print("ERROR: No data available! Neither GitHub nor mock data could be loaded.")
    print(alarm)

# Run the data setup when this file is executed directly as a script.
if __name__ == "__main__":
    setup_mock_data()