debug weight dtype
- gradio/app.py +6 -0
- inference.py +7 -6
gradio/app.py
CHANGED
@@ -71,6 +71,12 @@ with gr.Blocks(css="footer {visibility: hidden}") as demo:
                 label="Input image",
                 interactive=True,
             )
+            tense_choice = gr.Dropdown(
+                label="I want to generate the",
+                choices=["present", "past, present and future"],
+                value="past, present and future",  # default selection
+                interactive=True,
+            )
             generate_btn = gr.Button("Generate video", variant="primary")
         with gr.Column():
             video_out = gr.Video(
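The diff adds the dropdown but does not show how its value reaches the generate callback. A minimal, self-contained sketch of that wiring, assuming the image component is bound to a variable like image_in and the click callback is named generate (neither name appears in this diff):

    import gradio as gr

    def generate(image, tense):
        # `tense` receives the dropdown string, e.g. "past, present and future",
        # which would be forwarded to the inference pipeline as the interval key.
        return None  # placeholder; the real handler would return a video file path

    with gr.Blocks() as demo:
        image_in = gr.Image(label="Input image", interactive=True)  # assumed variable name
        tense_choice = gr.Dropdown(
            label="I want to generate the",
            choices=["present", "past, present and future"],
            value="past, present and future",
            interactive=True,
        )
        generate_btn = gr.Button("Generate video", variant="primary")
        video_out = gr.Video(label="Output video")
        generate_btn.click(fn=generate, inputs=[image_in, tense_choice], outputs=video_out)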
inference.py
CHANGED
@@ -61,7 +61,7 @@ INTERVALS = {
         "mode": "1x",
         "fps": 240
     },
-    "
+    "past, present and future": {
         "in_start": 4,
         "in_end": 12,
         "out_start": 0,
@@ -161,6 +161,10 @@ def load_model(args):
     # as these weights are only used for inference, keeping weights in full precision is not required.
     weight_dtype = torch.bfloat16

+    # text_encoder.to(dtype=weight_dtype)
+    # transformer.to(dtype=weight_dtype)
+    # vae.to(dtype=weight_dtype)
+
     pipe = ControlnetCogVideoXPipeline.from_pretrained(
         args.pretrained_model_path,
         tokenizer=tokenizer,
@@ -241,16 +245,13 @@ def main(args):
         image_paths = [image_path]

     pipe, model_config = load_model(args)
-
-    # text_encoder.to(args.device, dtype=weight_dtype)
-    # transformer.to(args.device, dtype=weight_dtype)
-    # vae.to(args.device, dtype=weight_dtype)
+
     pipe = pipe.to(args.device)

     for image_path in image_paths:
         image = Image.open(image_path)

-        processed_image, video = inference_on_image(pipe, image, "
+        processed_image, video = inference_on_image(pipe, image, "past, present and future", model_config, args)

         vid_output_path = output_path / f"{image_path.stem}.mp4"
         export_to_video(video, vid_output_path, fps=20)