# cosmos-predict2-space / handler.py

import torch
import base64
import io
import os
from typing import Optional
from PIL import Image
from fastapi import FastAPI, HTTPException
from pydantic import BaseModel

app = FastAPI()

# Globals populated at startup
pipe = None
export_to_video = None


# Documents the expected request schema; the handler below accepts a raw
# dict so it can tolerate both flat and nested ("inputs") payloads.
class InferenceRequest(BaseModel):
    image: str  # base64-encoded image or a URL
    prompt: str
    negative_prompt: str = "ugly, static, blurry, low quality"
    num_frames: int = 93
    num_inference_steps: int = 35
    guidance_scale: float = 7.0
    seed: Optional[int] = None
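
# Illustrative request body (an example, not from the original handler; the
# "image" field may alternatively be a plain https URL):
#
# {
#   "inputs": {
#     "image": "<base64-encoded JPEG/PNG>",
#     "prompt": "a robot arm picks up a red cube",
#     "num_frames": 93,
#     "seed": 42
#   }
# }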


@app.on_event("startup")
async def load_model():
    global pipe, export_to_video
    from diffusers import Cosmos2VideoToWorldPipeline
    from diffusers.utils import export_to_video as etv

    export_to_video = etv
    model_id = "nvidia/Cosmos-Predict2-2B-Video2World"
    print("Loading model...")
    pipe = Cosmos2VideoToWorldPipeline.from_pretrained(
        model_id,
        torch_dtype=torch.bfloat16,
        token=os.environ.get("HF_TOKEN"),
    )
    pipe.to("cuda")
    print("Model loaded successfully!")
@app.post("/predict")
@app.post("/")
async def predict(request: dict):
global pipe, export_to_video
# Handle both direct and nested input formats
inputs = request.get("inputs", request)
image_data = inputs.get("image")
if not image_data:
raise HTTPException(status_code=400, detail="No image provided")
prompt = inputs.get("prompt", "")
if not prompt:
raise HTTPException(status_code=400, detail="No prompt provided")
    # Load the conditioning image from a URL or a base64 string
    try:
        if image_data.startswith("http"):
            from diffusers.utils import load_image

            image = load_image(image_data)
        else:
            image_bytes = base64.b64decode(image_data)
            image = Image.open(io.BytesIO(image_bytes)).convert("RGB")
        # Resize to the 1280x704 resolution expected by Cosmos Video2World
        image = image.resize((1280, 704))
    except Exception as e:
        raise HTTPException(status_code=400, detail=f"Failed to load image: {e}")
    # Generation parameters; defaults mirror InferenceRequest above
    negative_prompt = inputs.get("negative_prompt", "ugly, static, blurry, low quality")
    num_frames = int(inputs.get("num_frames", 93))
    num_inference_steps = int(inputs.get("num_inference_steps", 35))
    guidance_scale = float(inputs.get("guidance_scale", 7.0))
    seed = inputs.get("seed")
    # Create the generator on the same device as the pipeline to avoid device mismatches
    generator = None
    if seed is not None:
        generator = torch.Generator(device="cuda").manual_seed(int(seed))
    try:
        # Run under bfloat16 autocast so intermediate ops match the pipeline dtype
        with torch.autocast(device_type="cuda", dtype=torch.bfloat16):
            output = pipe(
                image=image,
                prompt=prompt,
                negative_prompt=negative_prompt,
                num_frames=num_frames,
                num_inference_steps=num_inference_steps,
                guidance_scale=guidance_scale,
                generator=generator,
            )
        video_path = "/tmp/output.mp4"
        export_to_video(output.frames[0], video_path, fps=16)
        with open(video_path, "rb") as f:
            video_b64 = base64.b64encode(f.read()).decode("utf-8")
        return {"video": video_b64, "content_type": "video/mp4"}
    except Exception as e:
        import traceback

        traceback.print_exc()
        raise HTTPException(status_code=500, detail=f"Inference failed: {e}")
@app.get("/health")
@app.get("/")
async def health():
return {"status": "healthy", "message": "Cosmos-Predict2 Video2World API"}