altpuppet committed on
Commit
6cc66f0
·
1 Parent(s): 055af68

Fix ZeroGPU timeout issue - extend duration and optimize model loading

Browse files
Files changed (1) hide show
  1. app.py +16 -7
app.py CHANGED
@@ -457,26 +457,29 @@ def create_gradio_app():
457
 
458
  return metrics
459
 
460
- @spaces.GPU
461
  def run_gpu_inference(history_values_tensor, future_values_tensor, start, freq_object):
462
  """
463
  GPU-only inference function for ZeroGPU Spaces.
464
  ALL CUDA operations must happen inside this decorated function.
 
465
  """
466
  global model
467
 
468
- # Load model once on first call
469
  if model is None:
470
  print("--- Loading TempoPFN model for the first time ---")
471
- device = torch.device("cuda:0")
472
  print(f"Downloading model...")
473
  model_path = hf_hub_download(repo_id="AutoML-org/TempoPFN", filename="models/checkpoint_38M.pth")
474
- print(f"Loading model from {model_path} to {device}...")
475
- model = load_model(config_path="configs/example.yaml", model_path=model_path, device=device)
476
- print("--- Model loaded successfully ---")
 
477
 
478
- # Move tensors to GPU inside the decorated function
479
  device = torch.device("cuda:0")
 
 
480
 
481
  # Prepare container with GPU tensors
482
  container = BatchTimeSeriesContainer(
@@ -487,9 +490,14 @@ def create_gradio_app():
487
  )
488
 
489
  # Run inference with bfloat16 autocast
 
490
  with torch.no_grad(), torch.autocast(device_type="cuda", dtype=torch.bfloat16, enabled=True):
491
  model_output = model(container)
492
 
 
 
 
 
493
  return model_output
494
 
495
  def forecast_time_series(data_source, stock_ticker, uploaded_file, forecast_horizon, history_length, seed, synth_generator="Sine Waves", synth_complexity=5):
@@ -760,6 +768,7 @@ def create_gradio_app():
760
  with gr.Blocks(title="TempoPFN") as app:
761
 
762
  gr.Markdown("# TempoPFN\n### Zero-Shot Forecasting & Analysis Terminal\n*Powered by synthetic pre-training • Forecast anything, anywhere*")
 
763
 
764
  with gr.Tabs() as tabs:
765
 
 
457
 
458
  return metrics
459
 
460
+ @spaces.GPU(duration=120) # Extend timeout to 120 seconds for first-run compilation
461
  def run_gpu_inference(history_values_tensor, future_values_tensor, start, freq_object):
462
  """
463
  GPU-only inference function for ZeroGPU Spaces.
464
  ALL CUDA operations must happen inside this decorated function.
465
+ Extended timeout for Triton kernel compilation on first run.
466
  """
467
  global model
468
 
469
+ # Load model once on first call (on CPU first to save GPU time)
470
  if model is None:
471
  print("--- Loading TempoPFN model for the first time ---")
 
472
  print(f"Downloading model...")
473
  model_path = hf_hub_download(repo_id="AutoML-org/TempoPFN", filename="models/checkpoint_38M.pth")
474
+ # Load on CPU first to save GPU allocation time
475
+ print(f"Loading model from {model_path} to CPU first...")
476
+ model = load_model(config_path="configs/example.yaml", model_path=model_path, device=torch.device("cpu"))
477
+ print("--- Model loaded successfully on CPU ---")
478
 
479
+ # Move model to GPU inside the decorated function
480
  device = torch.device("cuda:0")
481
+ print(f"Moving model to {device}...")
482
+ model.to(device)
483
 
484
  # Prepare container with GPU tensors
485
  container = BatchTimeSeriesContainer(
 
490
  )
491
 
492
  # Run inference with bfloat16 autocast
493
+ print("Running inference...")
494
  with torch.no_grad(), torch.autocast(device_type="cuda", dtype=torch.bfloat16, enabled=True):
495
  model_output = model(container)
496
 
497
+ # Move model back to CPU to free GPU memory
498
+ model.to(torch.device("cpu"))
499
+ print("Inference complete, model moved back to CPU")
500
+
501
  return model_output
502
 
503
  def forecast_time_series(data_source, stock_ticker, uploaded_file, forecast_horizon, history_length, seed, synth_generator="Sine Waves", synth_complexity=5):
 
768
  with gr.Blocks(title="TempoPFN") as app:
769
 
770
  gr.Markdown("# TempoPFN\n### Zero-Shot Forecasting & Analysis Terminal\n*Powered by synthetic pre-training • Forecast anything, anywhere*")
771
+ gr.Markdown("⚠️ **First Run Note**: Initial inference may take 60-90 seconds due to Triton kernel compilation. Subsequent runs will be much faster!")
772
 
773
  with gr.Tabs() as tabs:
774