ftaubner committed
Commit 7245cc5 · 1 Parent(s): bbe1648

initial commit

LICENSE ADDED
@@ -0,0 +1,201 @@

Apache License
Version 2.0, January 2004
http://www.apache.org/licenses/

TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION

1. Definitions.

"License" shall mean the terms and conditions for use, reproduction,
and distribution as defined by Sections 1 through 9 of this document.

"Licensor" shall mean the copyright owner or entity authorized by
the copyright owner that is granting the License.

"Legal Entity" shall mean the union of the acting entity and all
other entities that control, are controlled by, or are under common
control with that entity. For the purposes of this definition,
"control" means (i) the power, direct or indirect, to cause the
direction or management of such entity, whether by contract or
otherwise, or (ii) ownership of fifty percent (50%) or more of the
outstanding shares, or (iii) beneficial ownership of such entity.

"You" (or "Your") shall mean an individual or Legal Entity
exercising permissions granted by this License.

"Source" form shall mean the preferred form for making modifications,
including but not limited to software source code, documentation
source, and configuration files.

"Object" form shall mean any form resulting from mechanical
transformation or translation of a Source form, including but
not limited to compiled object code, generated documentation,
and conversions to other media types.

"Work" shall mean the work of authorship, whether in Source or
Object form, made available under the License, as indicated by a
copyright notice that is included in or attached to the work
(an example is provided in the Appendix below).

"Derivative Works" shall mean any work, whether in Source or Object
form, that is based on (or derived from) the Work and for which the
editorial revisions, annotations, elaborations, or other modifications
represent, as a whole, an original work of authorship. For the purposes
of this License, Derivative Works shall not include works that remain
separable from, or merely link (or bind by name) to the interfaces of,
the Work and Derivative Works thereof.

"Contribution" shall mean any work of authorship, including
the original version of the Work and any modifications or additions
to that Work or Derivative Works thereof, that is intentionally
submitted to Licensor for inclusion in the Work by the copyright owner
or by an individual or Legal Entity authorized to submit on behalf of
the copyright owner. For the purposes of this definition, "submitted"
means any form of electronic, verbal, or written communication sent
to the Licensor or its representatives, including but not limited to
communication on electronic mailing lists, source code control systems,
and issue tracking systems that are managed by, or on behalf of, the
Licensor for the purpose of discussing and improving the Work, but
excluding communication that is conspicuously marked or otherwise
designated in writing by the copyright owner as "Not a Contribution."

"Contributor" shall mean Licensor and any individual or Legal Entity
on behalf of whom a Contribution has been received by Licensor and
subsequently incorporated within the Work.

2. Grant of Copyright License. Subject to the terms and conditions of
this License, each Contributor hereby grants to You a perpetual,
worldwide, non-exclusive, no-charge, royalty-free, irrevocable
copyright license to reproduce, prepare Derivative Works of,
publicly display, publicly perform, sublicense, and distribute the
Work and such Derivative Works in Source or Object form.

3. Grant of Patent License. Subject to the terms and conditions of
this License, each Contributor hereby grants to You a perpetual,
worldwide, non-exclusive, no-charge, royalty-free, irrevocable
(except as stated in this section) patent license to make, have made,
use, offer to sell, sell, import, and otherwise transfer the Work,
where such license applies only to those patent claims licensable
by such Contributor that are necessarily infringed by their
Contribution(s) alone or by combination of their Contribution(s)
with the Work to which such Contribution(s) was submitted. If You
institute patent litigation against any entity (including a
cross-claim or counterclaim in a lawsuit) alleging that the Work
or a Contribution incorporated within the Work constitutes direct
or contributory patent infringement, then any patent licenses
granted to You under this License for that Work shall terminate
as of the date such litigation is filed.

4. Redistribution. You may reproduce and distribute copies of the
Work or Derivative Works thereof in any medium, with or without
modifications, and in Source or Object form, provided that You
meet the following conditions:

(a) You must give any other recipients of the Work or
Derivative Works a copy of this License; and

(b) You must cause any modified files to carry prominent notices
stating that You changed the files; and

(c) You must retain, in the Source form of any Derivative Works
that You distribute, all copyright, patent, trademark, and
attribution notices from the Source form of the Work,
excluding those notices that do not pertain to any part of
the Derivative Works; and

(d) If the Work includes a "NOTICE" text file as part of its
distribution, then any Derivative Works that You distribute must
include a readable copy of the attribution notices contained
within such NOTICE file, excluding those notices that do not
pertain to any part of the Derivative Works, in at least one
of the following places: within a NOTICE text file distributed
as part of the Derivative Works; within the Source form or
documentation, if provided along with the Derivative Works; or,
within a display generated by the Derivative Works, if and
wherever such third-party notices normally appear. The contents
of the NOTICE file are for informational purposes only and
do not modify the License. You may add Your own attribution
notices within Derivative Works that You distribute, alongside
or as an addendum to the NOTICE text from the Work, provided
that such additional attribution notices cannot be construed
as modifying the License.

You may add Your own copyright statement to Your modifications and
may provide additional or different license terms and conditions
for use, reproduction, or distribution of Your modifications, or
for any such Derivative Works as a whole, provided Your use,
reproduction, and distribution of the Work otherwise complies with
the conditions stated in this License.

5. Submission of Contributions. Unless You explicitly state otherwise,
any Contribution intentionally submitted for inclusion in the Work
by You to the Licensor shall be under the terms and conditions of
this License, without any additional terms or conditions.
Notwithstanding the above, nothing herein shall supersede or modify
the terms of any separate license agreement you may have executed
with Licensor regarding such Contributions.

6. Trademarks. This License does not grant permission to use the trade
names, trademarks, service marks, or product names of the Licensor,
except as required for reasonable and customary use in describing the
origin of the Work and reproducing the content of the NOTICE file.

7. Disclaimer of Warranty. Unless required by applicable law or
agreed to in writing, Licensor provides the Work (and each
Contributor provides its Contributions) on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
implied, including, without limitation, any warranties or conditions
of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A
PARTICULAR PURPOSE. You are solely responsible for determining the
appropriateness of using or redistributing the Work and assume any
risks associated with Your exercise of permissions under this License.

8. Limitation of Liability. In no event and under no legal theory,
whether in tort (including negligence), contract, or otherwise,
unless required by applicable law (such as deliberate and grossly
negligent acts) or agreed to in writing, shall any Contributor be
liable to You for damages, including any direct, indirect, special,
incidental, or consequential damages of any character arising as a
result of this License or out of the use or inability to use the
Work (including but not limited to damages for loss of goodwill,
work stoppage, computer failure or malfunction, or any and all
other commercial damages or losses), even if such Contributor
has been advised of the possibility of such damages.

9. Accepting Warranty or Additional Liability. While redistributing
the Work or Derivative Works thereof, You may choose to offer,
and charge a fee for, acceptance of support, warranty, indemnity,
or other liability obligations and/or rights consistent with this
License. However, in accepting such obligations, You may act only
on Your own behalf and on Your sole responsibility, not on behalf
of any other Contributor, and only if You agree to indemnify,
defend, and hold each Contributor harmless for any liability
incurred by, or claims asserted against, such Contributor by reason
of your accepting any such warranty or additional liability.

END OF TERMS AND CONDITIONS

APPENDIX: How to apply the Apache License to your work.

To apply the Apache License to your work, attach the following
boilerplate notice, with the fields enclosed by brackets "[]"
replaced with your own identifying information. (Don't include
the brackets!) The text should be enclosed in the appropriate
comment syntax for the file format. We also recommend that a
file or class name and description of purpose be included on the
same "printed page" as the copyright notice for easier
identification within third-party archives.

Copyright 2024 CogVideo Model Team @ Zhipu AI

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

    http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
cogvideo_embeddings.py ADDED
The diff for this file is too large to render. See raw diff
 
cogvideo_transformer.py ADDED
@@ -0,0 +1,547 @@

# Copyright 2024 The CogVideoX team, Tsinghua University & ZhipuAI and The HuggingFace Team.
# All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

from typing import Any, Dict, Optional, Tuple, Union

import torch
from torch import nn

from diffusers.configuration_utils import ConfigMixin, register_to_config
from diffusers.loaders import PeftAdapterMixin
from diffusers.utils import USE_PEFT_BACKEND, is_torch_version, logging, scale_lora_layers, unscale_lora_layers
from diffusers.utils.torch_utils import maybe_allow_in_graph
from diffusers.models.attention import Attention, FeedForward
from diffusers.models.attention_processor import AttentionProcessor, CogVideoXAttnProcessor2_0, FusedCogVideoXAttnProcessor2_0
# from diffusers.models.embeddings import CogVideoXPatchEmbed, TimestepEmbedding, Timesteps
from cogvideo_embeddings import CogVideoXPatchEmbedWBlur, TimestepEmbedding, Timesteps
from diffusers.models.modeling_outputs import Transformer2DModelOutput
from diffusers.models.modeling_utils import ModelMixin
from diffusers.models.normalization import AdaLayerNorm, CogVideoXLayerNormZero


logger = logging.get_logger(__name__)  # pylint: disable=invalid-name


@maybe_allow_in_graph
class CogVideoXBlock(nn.Module):
    r"""
    Transformer block used in the [CogVideoX](https://github.com/THUDM/CogVideo) model.

    Parameters:
        dim (`int`):
            The number of channels in the input and output.
        num_attention_heads (`int`):
            The number of heads to use for multi-head attention.
        attention_head_dim (`int`):
            The number of channels in each head.
        time_embed_dim (`int`):
            The number of channels in the timestep embedding.
        dropout (`float`, defaults to `0.0`):
            The dropout probability to use.
        activation_fn (`str`, defaults to `"gelu-approximate"`):
            Activation function to be used in the feed-forward.
        attention_bias (`bool`, defaults to `False`):
            Whether or not to use bias in the attention projection layers.
        qk_norm (`bool`, defaults to `True`):
            Whether or not to use normalization after the query and key projections in attention.
        norm_elementwise_affine (`bool`, defaults to `True`):
            Whether to use learnable elementwise affine parameters for normalization.
        norm_eps (`float`, defaults to `1e-5`):
            Epsilon value for normalization layers.
        final_dropout (`bool`, defaults to `True`):
            Whether to apply a final dropout after the last feed-forward layer.
        ff_inner_dim (`int`, *optional*, defaults to `None`):
            Custom hidden dimension of the feed-forward layer. If not provided, `4 * dim` is used.
        ff_bias (`bool`, defaults to `True`):
            Whether or not to use bias in the feed-forward layer.
        attention_out_bias (`bool`, defaults to `True`):
            Whether or not to use bias in the attention output projection layer.
    """

    def __init__(
        self,
        dim: int,
        num_attention_heads: int,
        attention_head_dim: int,
        time_embed_dim: int,
        dropout: float = 0.0,
        activation_fn: str = "gelu-approximate",
        attention_bias: bool = False,
        qk_norm: bool = True,
        norm_elementwise_affine: bool = True,
        norm_eps: float = 1e-5,
        final_dropout: bool = True,
        ff_inner_dim: Optional[int] = None,
        ff_bias: bool = True,
        attention_out_bias: bool = True,
    ):
        super().__init__()

        # 1. Self Attention
        self.norm1 = CogVideoXLayerNormZero(time_embed_dim, dim, norm_elementwise_affine, norm_eps, bias=True)

        self.attn1 = Attention(
            query_dim=dim,
            dim_head=attention_head_dim,
            heads=num_attention_heads,
            qk_norm="layer_norm" if qk_norm else None,
            eps=1e-6,
            bias=attention_bias,
            out_bias=attention_out_bias,
            processor=CogVideoXAttnProcessor2_0(),
        )

        # 2. Feed Forward
        self.norm2 = CogVideoXLayerNormZero(time_embed_dim, dim, norm_elementwise_affine, norm_eps, bias=True)

        self.ff = FeedForward(
            dim,
            dropout=dropout,
            activation_fn=activation_fn,
            final_dropout=final_dropout,
            inner_dim=ff_inner_dim,
            bias=ff_bias,
        )

    def forward(
        self,
        hidden_states: torch.Tensor,
        encoder_hidden_states: torch.Tensor,
        temb: torch.Tensor,
        image_rotary_emb: Optional[Tuple[torch.Tensor, torch.Tensor]] = None,
    ) -> torch.Tensor:
        text_seq_length = encoder_hidden_states.size(1)

        # norm & modulate
        norm_hidden_states, norm_encoder_hidden_states, gate_msa, enc_gate_msa = self.norm1(
            hidden_states, encoder_hidden_states, temb
        )

        # attention
        attn_hidden_states, attn_encoder_hidden_states = self.attn1(
            hidden_states=norm_hidden_states,
            encoder_hidden_states=norm_encoder_hidden_states,
            image_rotary_emb=image_rotary_emb,
        )

        hidden_states = hidden_states + gate_msa * attn_hidden_states
        encoder_hidden_states = encoder_hidden_states + enc_gate_msa * attn_encoder_hidden_states

        # norm & modulate
        norm_hidden_states, norm_encoder_hidden_states, gate_ff, enc_gate_ff = self.norm2(
            hidden_states, encoder_hidden_states, temb
        )

        # feed-forward
        norm_hidden_states = torch.cat([norm_encoder_hidden_states, norm_hidden_states], dim=1)
        ff_output = self.ff(norm_hidden_states)

        hidden_states = hidden_states + gate_ff * ff_output[:, text_seq_length:]
        encoder_hidden_states = encoder_hidden_states + enc_gate_ff * ff_output[:, :text_seq_length]

        return hidden_states, encoder_hidden_states


class CogVideoXTransformer3DModel(ModelMixin, ConfigMixin, PeftAdapterMixin):
    """
    A Transformer model for video-like data in [CogVideoX](https://github.com/THUDM/CogVideo).

    Parameters:
        num_attention_heads (`int`, defaults to `30`):
            The number of heads to use for multi-head attention.
        attention_head_dim (`int`, defaults to `64`):
            The number of channels in each head.
        in_channels (`int`, defaults to `16`):
            The number of channels in the input.
        out_channels (`int`, *optional*, defaults to `16`):
            The number of channels in the output.
        flip_sin_to_cos (`bool`, defaults to `True`):
            Whether to flip the sin to cos in the time embedding.
        time_embed_dim (`int`, defaults to `512`):
            Output dimension of timestep embeddings.
        ofs_embed_dim (`int`, defaults to `512`):
            Output dimension of the "ofs" embeddings used in CogVideoX-5B I2V in version 1.5.
        text_embed_dim (`int`, defaults to `4096`):
            Input dimension of text embeddings from the text encoder.
        num_layers (`int`, defaults to `30`):
            The number of layers of Transformer blocks to use.
        dropout (`float`, defaults to `0.0`):
            The dropout probability to use.
        attention_bias (`bool`, defaults to `True`):
            Whether to use bias in the attention projection layers.
        sample_width (`int`, defaults to `90`):
            The width of the input latents.
        sample_height (`int`, defaults to `60`):
            The height of the input latents.
        sample_frames (`int`, defaults to `49`):
            The number of frames in the input latents. Note that this parameter was incorrectly initialized to 49
            instead of 13 because CogVideoX processed 13 latent frames at once in its default and recommended settings,
            but cannot be changed to the correct value to ensure backwards compatibility. To create a transformer with
            K latent frames, the correct value to pass here would be: ((K - 1) * temporal_compression_ratio + 1).
        patch_size (`int`, defaults to `2`):
            The size of the patches to use in the patch embedding layer.
        temporal_compression_ratio (`int`, defaults to `4`):
            The compression ratio across the temporal dimension. See documentation for `sample_frames`.
        max_text_seq_length (`int`, defaults to `226`):
            The maximum sequence length of the input text embeddings.
        activation_fn (`str`, defaults to `"gelu-approximate"`):
            Activation function to use in the feed-forward.
        timestep_activation_fn (`str`, defaults to `"silu"`):
            Activation function to use when generating the timestep embeddings.
        norm_elementwise_affine (`bool`, defaults to `True`):
            Whether to use elementwise affine in normalization layers.
        norm_eps (`float`, defaults to `1e-5`):
            The epsilon value to use in normalization layers.
        spatial_interpolation_scale (`float`, defaults to `1.875`):
            Scaling factor to apply in 3D positional embeddings across spatial dimensions.
        temporal_interpolation_scale (`float`, defaults to `1.0`):
            Scaling factor to apply in 3D positional embeddings across temporal dimensions.
    """

    _supports_gradient_checkpointing = True

    @register_to_config
    def __init__(
        self,
        num_attention_heads: int = 30,
        attention_head_dim: int = 64,
        in_channels: int = 16,
        out_channels: Optional[int] = 16,
        flip_sin_to_cos: bool = True,
        freq_shift: int = 0,
        time_embed_dim: int = 512,
        ofs_embed_dim: Optional[int] = None,
        text_embed_dim: int = 4096,
        num_layers: int = 30,
        dropout: float = 0.0,
        attention_bias: bool = True,
        sample_width: int = 90,
        sample_height: int = 60,
        sample_frames: int = 49,
        patch_size: int = 2,
        patch_size_t: Optional[int] = None,
        temporal_compression_ratio: int = 4,
        max_text_seq_length: int = 226,
        activation_fn: str = "gelu-approximate",
        timestep_activation_fn: str = "silu",
        norm_elementwise_affine: bool = True,
        norm_eps: float = 1e-5,
        spatial_interpolation_scale: float = 1.875,
        temporal_interpolation_scale: float = 1.0,
        use_rotary_positional_embeddings: bool = False,
        use_learned_positional_embeddings: bool = False,
        patch_bias: bool = True,
    ):
        super().__init__()
        inner_dim = num_attention_heads * attention_head_dim

        if not use_rotary_positional_embeddings and use_learned_positional_embeddings:
            raise ValueError(
                "There are no CogVideoX checkpoints available with rotary embeddings disabled and learned positional "
                "embeddings enabled. If you're using a custom model and/or believe this should be supported, please "
                "open an issue at https://github.com/huggingface/diffusers/issues."
            )

        # 1. Patch embedding
        self.patch_embed = CogVideoXPatchEmbedWBlur(
            patch_size=patch_size,
            patch_size_t=patch_size_t,
            in_channels=in_channels,
            embed_dim=inner_dim,
            text_embed_dim=text_embed_dim,
            bias=patch_bias,
            sample_width=sample_width,
            sample_height=sample_height,
            sample_frames=sample_frames,
            temporal_compression_ratio=temporal_compression_ratio,
            max_text_seq_length=max_text_seq_length,
            spatial_interpolation_scale=spatial_interpolation_scale,
            temporal_interpolation_scale=temporal_interpolation_scale,
            use_positional_embeddings=not use_rotary_positional_embeddings,
            use_learned_positional_embeddings=use_learned_positional_embeddings,
        )
        self.embedding_dropout = nn.Dropout(dropout)

        # 2. Time embeddings and ofs embedding (only CogVideoX 1.5-5B I2V has the latter)
        self.time_proj = Timesteps(inner_dim, flip_sin_to_cos, freq_shift)
        self.time_embedding = TimestepEmbedding(inner_dim, time_embed_dim, timestep_activation_fn)

        self.ofs_proj = None
        self.ofs_embedding = None
        if ofs_embed_dim:
            self.ofs_proj = Timesteps(ofs_embed_dim, flip_sin_to_cos, freq_shift)
            self.ofs_embedding = TimestepEmbedding(
                ofs_embed_dim, ofs_embed_dim, timestep_activation_fn
            )  # same as time embeddings, for ofs

        # 3. Define spatio-temporal transformer blocks
        self.transformer_blocks = nn.ModuleList(
            [
                CogVideoXBlock(
                    dim=inner_dim,
                    num_attention_heads=num_attention_heads,
                    attention_head_dim=attention_head_dim,
                    time_embed_dim=time_embed_dim,
                    dropout=dropout,
                    activation_fn=activation_fn,
                    attention_bias=attention_bias,
                    norm_elementwise_affine=norm_elementwise_affine,
                    norm_eps=norm_eps,
                )
                for _ in range(num_layers)
            ]
        )
        self.norm_final = nn.LayerNorm(inner_dim, norm_eps, norm_elementwise_affine)

        # 4. Output blocks
        self.norm_out = AdaLayerNorm(
            embedding_dim=time_embed_dim,
            output_dim=2 * inner_dim,
            norm_elementwise_affine=norm_elementwise_affine,
            norm_eps=norm_eps,
            chunk_dim=1,
        )

        if patch_size_t is None:
            # For CogVideoX 1.0
            output_dim = patch_size * patch_size * out_channels
        else:
            # For CogVideoX 1.5
            output_dim = patch_size * patch_size * patch_size_t * out_channels

        self.proj_out = nn.Linear(inner_dim, output_dim)

        self.gradient_checkpointing = False

    def _set_gradient_checkpointing(self, module, value=False):
        self.gradient_checkpointing = value

    @property
    # Copied from diffusers.models.unets.unet_2d_condition.UNet2DConditionModel.attn_processors
    def attn_processors(self) -> Dict[str, AttentionProcessor]:
        r"""
        Returns:
            `dict` of attention processors: A dictionary containing all attention processors used in the model,
            indexed by its weight name.
        """
        # set recursively
        processors = {}

        def fn_recursive_add_processors(name: str, module: torch.nn.Module, processors: Dict[str, AttentionProcessor]):
            if hasattr(module, "get_processor"):
                processors[f"{name}.processor"] = module.get_processor()

            for sub_name, child in module.named_children():
                fn_recursive_add_processors(f"{name}.{sub_name}", child, processors)

            return processors

        for name, module in self.named_children():
            fn_recursive_add_processors(name, module, processors)

        return processors

    # Copied from diffusers.models.unets.unet_2d_condition.UNet2DConditionModel.set_attn_processor
    def set_attn_processor(self, processor: Union[AttentionProcessor, Dict[str, AttentionProcessor]]):
        r"""
        Sets the attention processor to use to compute attention.

        Parameters:
            processor (`dict` of `AttentionProcessor` or only `AttentionProcessor`):
                The instantiated processor class or a dictionary of processor classes that will be set as the processor
                for **all** `Attention` layers.

                If `processor` is a dict, the key needs to define the path to the corresponding cross attention
                processor. This is strongly recommended when setting trainable attention processors.

        """
        count = len(self.attn_processors.keys())

        if isinstance(processor, dict) and len(processor) != count:
            raise ValueError(
                f"A dict of processors was passed, but the number of processors {len(processor)} does not match the"
                f" number of attention layers: {count}. Please make sure to pass {count} processor classes."
            )

        def fn_recursive_attn_processor(name: str, module: torch.nn.Module, processor):
            if hasattr(module, "set_processor"):
                if not isinstance(processor, dict):
                    module.set_processor(processor)
                else:
                    module.set_processor(processor.pop(f"{name}.processor"))

            for sub_name, child in module.named_children():
                fn_recursive_attn_processor(f"{name}.{sub_name}", child, processor)

        for name, module in self.named_children():
            fn_recursive_attn_processor(name, module, processor)

    # Copied from diffusers.models.unets.unet_2d_condition.UNet2DConditionModel.fuse_qkv_projections with FusedAttnProcessor2_0->FusedCogVideoXAttnProcessor2_0
    def fuse_qkv_projections(self):
        """
        Enables fused QKV projections. For self-attention modules, all projection matrices (i.e., query, key, value)
        are fused. For cross-attention modules, key and value projection matrices are fused.

        <Tip warning={true}>

        This API is 🧪 experimental.

        </Tip>
        """
        self.original_attn_processors = None

        for _, attn_processor in self.attn_processors.items():
            if "Added" in str(attn_processor.__class__.__name__):
                raise ValueError("`fuse_qkv_projections()` is not supported for models having added KV projections.")

        self.original_attn_processors = self.attn_processors

        for module in self.modules():
            if isinstance(module, Attention):
                module.fuse_projections(fuse=True)

        self.set_attn_processor(FusedCogVideoXAttnProcessor2_0())

    # Copied from diffusers.models.unets.unet_2d_condition.UNet2DConditionModel.unfuse_qkv_projections
    def unfuse_qkv_projections(self):
        """Disables the fused QKV projection if enabled.

        <Tip warning={true}>

        This API is 🧪 experimental.

        </Tip>

        """
        if self.original_attn_processors is not None:
            self.set_attn_processor(self.original_attn_processors)

    def forward(
        self,
        hidden_states: torch.Tensor,
        encoder_hidden_states: torch.Tensor,
        timestep: Union[int, float, torch.LongTensor],
        intervals: Optional[torch.Tensor],
        condition_mask: Optional[torch.Tensor] = None,
        timestep_cond: Optional[torch.Tensor] = None,
        ofs: Optional[Union[int, float, torch.LongTensor]] = None,
        image_rotary_emb: Optional[Tuple[torch.Tensor, torch.Tensor]] = None,
        attention_kwargs: Optional[Dict[str, Any]] = None,
        return_dict: bool = True,
    ):
        if attention_kwargs is not None:
            attention_kwargs = attention_kwargs.copy()
            lora_scale = attention_kwargs.pop("scale", 1.0)
        else:
            lora_scale = 1.0

        if USE_PEFT_BACKEND:
            # weight the lora layers by setting `lora_scale` for each PEFT layer
            scale_lora_layers(self, lora_scale)
        else:
            if attention_kwargs is not None and attention_kwargs.get("scale", None) is not None:
                logger.warning(
                    "Passing `scale` via `attention_kwargs` when not using the PEFT backend is ineffective."
                )

        batch_size, num_frames, channels, height, width = hidden_states.shape

        # 1. Time embedding
        timesteps = timestep
        t_emb = self.time_proj(timesteps)

        # timesteps does not contain any weights and will always return f32 tensors
        # but time_embedding might actually be running in fp16, so we need to cast here.
        # there might be better ways to encapsulate this.
        t_emb = t_emb.to(dtype=hidden_states.dtype)
        emb = self.time_embedding(t_emb, timestep_cond)

        if self.ofs_embedding is not None:
            ofs_emb = self.ofs_proj(ofs)
            ofs_emb = ofs_emb.to(dtype=hidden_states.dtype)
            ofs_emb = self.ofs_embedding(ofs_emb)
            emb = emb + ofs_emb

        # 2. Patch embedding
        hidden_states = self.patch_embed(encoder_hidden_states, hidden_states, intervals, condition_mask)
        hidden_states = self.embedding_dropout(hidden_states)

        text_seq_length = encoder_hidden_states.shape[1]
        encoder_hidden_states = hidden_states[:, :text_seq_length]
        hidden_states = hidden_states[:, text_seq_length:]

        # 3. Transformer blocks
        for i, block in enumerate(self.transformer_blocks):
            if torch.is_grad_enabled() and self.gradient_checkpointing:

                def create_custom_forward(module):
                    def custom_forward(*inputs):
                        return module(*inputs)

                    return custom_forward

                ckpt_kwargs: Dict[str, Any] = {"use_reentrant": False} if is_torch_version(">=", "1.11.0") else {}
                hidden_states, encoder_hidden_states = torch.utils.checkpoint.checkpoint(
                    create_custom_forward(block),
                    hidden_states,
                    encoder_hidden_states,
                    emb,
                    image_rotary_emb,
                    **ckpt_kwargs,
                )
            else:
                hidden_states, encoder_hidden_states = block(
                    hidden_states=hidden_states,
                    encoder_hidden_states=encoder_hidden_states,
                    temb=emb,
                    image_rotary_emb=image_rotary_emb,
                )

        if not self.config.use_rotary_positional_embeddings:
            # CogVideoX-2B
            hidden_states = self.norm_final(hidden_states)
        else:
            # CogVideoX-5B
            hidden_states = torch.cat([encoder_hidden_states, hidden_states], dim=1)
            hidden_states = self.norm_final(hidden_states)
            hidden_states = hidden_states[:, text_seq_length:]

        # 4. Final block
        hidden_states = self.norm_out(hidden_states, temb=emb)
        hidden_states = self.proj_out(hidden_states)

        # 5. Unpatchify
        p = self.config.patch_size
        p_t = self.config.patch_size_t

        if p_t is None:
            output = hidden_states.reshape(batch_size, num_frames, height // p, width // p, -1, p, p)
            output = output.permute(0, 1, 4, 2, 5, 3, 6).flatten(5, 6).flatten(3, 4)
        else:
            output = hidden_states.reshape(
                batch_size, (num_frames + p_t - 1) // p_t, height // p, width // p, -1, p_t, p, p
            )
            output = output.permute(0, 1, 5, 4, 2, 6, 3, 7).flatten(6, 7).flatten(4, 5).flatten(1, 2)

        if USE_PEFT_BACKEND:
            # remove `lora_scale` from each PEFT layer
            unscale_lora_layers(self, lora_scale)

        if not return_dict:
            return (output,)
        return Transformer2DModelOutput(sample=output)
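
As a quick sanity check of the unpatchify step at the end of CogVideoXTransformer3DModel.forward (the patch_size_t is None branch), the same reshape/permute can be run on a dummy tensor. The sizes below (13 latent frames, a 60x90 latent grid, patch size 2, 16 output channels) are illustrative defaults only, not tied to any particular checkpoint; this is a minimal sketch, not part of the committed code.

import torch

# Illustrative sizes only: 13 latent frames, 60x90 latent grid, patch size 2, 16 output channels.
B, F, C_out, p = 1, 13, 16, 2
H_lat, W_lat = 60, 90
tokens_per_frame = (H_lat // p) * (W_lat // p)

# Shape produced by proj_out: one (p * p * C_out)-dim vector per patch token.
hidden_states = torch.randn(B, F * tokens_per_frame, p * p * C_out)

# Same unpatchify as in the forward pass above (patch_size_t is None branch).
output = hidden_states.reshape(B, F, H_lat // p, W_lat // p, -1, p, p)
output = output.permute(0, 1, 4, 2, 5, 3, 6).flatten(5, 6).flatten(3, 4)

assert output.shape == (B, F, C_out, H_lat, W_lat)  # [batch, frames, channels, height, width]
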
controlnet_pipeline.py ADDED
@@ -0,0 +1,733 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import inspect
2
+ import math
3
+ from typing import Callable, Dict, List, Optional, Tuple, Union
4
+
5
+ import torch
6
+ import numpy as np
7
+ from PIL import Image
8
+ from torchvision import transforms
9
+ from einops import rearrange, repeat
10
+ from transformers import T5EncoderModel, T5Tokenizer
11
+ from diffusers.video_processor import VideoProcessor
12
+ from diffusers.utils.torch_utils import randn_tensor
13
+ from diffusers.models.embeddings import get_3d_rotary_pos_embed
14
+ from diffusers.pipelines.pipeline_utils import DiffusionPipeline
15
+ from diffusers.models import AutoencoderKLCogVideoX, CogVideoXTransformer3DModel
16
+ from diffusers import CogVideoXDDIMScheduler, CogVideoXDPMScheduler
17
+ from diffusers.callbacks import MultiPipelineCallbacks, PipelineCallback
18
+ from diffusers.pipelines.cogvideo.pipeline_cogvideox import CogVideoXPipelineOutput, CogVideoXLoraLoaderMixin
19
+ from training.helpers import random_insert_latent_frame, transform_intervals
20
+ import torch.nn.functional as F
21
+ from torch.utils.checkpoint import checkpoint
22
+
23
+ def resize_for_crop(image, crop_h, crop_w):
24
+ img_h, img_w = image.shape[-2:]
25
+ if img_h >= crop_h and img_w >= crop_w:
26
+ coef = max(crop_h / img_h, crop_w / img_w)
27
+ elif img_h <= crop_h and img_w <= crop_w:
28
+ coef = max(crop_h / img_h, crop_w / img_w)
29
+ else:
30
+ coef = crop_h / img_h if crop_h > img_h else crop_w / img_w
31
+ out_h, out_w = int(img_h * coef), int(img_w * coef)
32
+ resized_image = transforms.functional.resize(image, (out_h, out_w), antialias=True)
33
+ return resized_image
34
+
35
+
36
+ def prepare_frames(input_images, video_size, do_resize=True, do_crop=True):
37
+ input_images = np.stack([np.array(x) for x in input_images])
38
+ images_tensor = torch.from_numpy(input_images).permute(0, 3, 1, 2) / 127.5 - 1
39
+ if do_resize:
40
+ images_tensor = [resize_for_crop(x, crop_h=video_size[0], crop_w=video_size[1]) for x in images_tensor]
41
+ if do_crop:
42
+ images_tensor = [transforms.functional.center_crop(x, video_size) for x in images_tensor]
43
+ if isinstance(images_tensor, list):
44
+ images_tensor = torch.stack(images_tensor)
45
+ return images_tensor.unsqueeze(0)
46
+
47
+
48
+ def get_resize_crop_region_for_grid(src, tgt_width, tgt_height):
49
+ tw = tgt_width
50
+ th = tgt_height
51
+ h, w = src
52
+ r = h / w
53
+ if r > (th / tw):
54
+ resize_height = th
55
+ resize_width = int(round(th / h * w))
56
+ else:
57
+ resize_width = tw
58
+ resize_height = int(round(tw / w * h))
59
+
60
+ crop_top = int(round((th - resize_height) / 2.0))
61
+ crop_left = int(round((tw - resize_width) / 2.0))
62
+
63
+ return (crop_top, crop_left), (crop_top + resize_height, crop_left + resize_width)
64
+
65
+
66
+ # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.retrieve_timesteps
67
+ def retrieve_timesteps(
68
+ scheduler,
69
+ num_inference_steps: Optional[int] = None,
70
+ device: Optional[Union[str, torch.device]] = None,
71
+ timesteps: Optional[List[int]] = None,
72
+ sigmas: Optional[List[float]] = None,
73
+ **kwargs,
74
+ ):
75
+ """
76
+ Calls the scheduler's `set_timesteps` method and retrieves timesteps from the scheduler after the call. Handles
77
+ custom timesteps. Any kwargs will be supplied to `scheduler.set_timesteps`.
78
+
79
+ Args:
80
+ scheduler (`SchedulerMixin`):
81
+ The scheduler to get timesteps from.
82
+ num_inference_steps (`int`):
83
+ The number of diffusion steps used when generating samples with a pre-trained model. If used, `timesteps`
84
+ must be `None`.
85
+ device (`str` or `torch.device`, *optional*):
86
+ The device to which the timesteps should be moved to. If `None`, the timesteps are not moved.
87
+ timesteps (`List[int]`, *optional*):
88
+ Custom timesteps used to override the timestep spacing strategy of the scheduler. If `timesteps` is passed,
89
+ `num_inference_steps` and `sigmas` must be `None`.
90
+ sigmas (`List[float]`, *optional*):
91
+ Custom sigmas used to override the timestep spacing strategy of the scheduler. If `sigmas` is passed,
92
+ `num_inference_steps` and `timesteps` must be `None`.
93
+
94
+ Returns:
95
+ `Tuple[torch.Tensor, int]`: A tuple where the first element is the timestep schedule from the scheduler and the
96
+ second element is the number of inference steps.
97
+ """
98
+ if timesteps is not None and sigmas is not None:
99
+ raise ValueError("Only one of `timesteps` or `sigmas` can be passed. Please choose one to set custom values")
100
+ if timesteps is not None:
101
+ accepts_timesteps = "timesteps" in set(inspect.signature(scheduler.set_timesteps).parameters.keys())
102
+ if not accepts_timesteps:
103
+ raise ValueError(
104
+ f"The current scheduler class {scheduler.__class__}'s `set_timesteps` does not support custom"
105
+ f" timestep schedules. Please check whether you are using the correct scheduler."
106
+ )
107
+ scheduler.set_timesteps(timesteps=timesteps, device=device, **kwargs)
108
+ timesteps = scheduler.timesteps
109
+ num_inference_steps = len(timesteps)
110
+ elif sigmas is not None:
111
+ accept_sigmas = "sigmas" in set(inspect.signature(scheduler.set_timesteps).parameters.keys())
112
+ if not accept_sigmas:
113
+ raise ValueError(
114
+ f"The current scheduler class {scheduler.__class__}'s `set_timesteps` does not support custom"
115
+ f" sigmas schedules. Please check whether you are using the correct scheduler."
116
+ )
117
+ scheduler.set_timesteps(sigmas=sigmas, device=device, **kwargs)
118
+ timesteps = scheduler.timesteps
119
+ num_inference_steps = len(timesteps)
120
+ else:
121
+ scheduler.set_timesteps(num_inference_steps, device=device, **kwargs)
122
+ timesteps = scheduler.timesteps
123
+ return timesteps, num_inference_steps
124
+
125
+ # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion_img2img.retrieve_latents
126
+ def retrieve_latents(
127
+ encoder_output: torch.Tensor, generator: Optional[torch.Generator] = None, sample_mode: str = "sample"
128
+ ):
129
+ if hasattr(encoder_output, "latent_dist") and sample_mode == "sample":
130
+ return encoder_output.latent_dist.sample(generator)
131
+ elif hasattr(encoder_output, "latent_dist") and sample_mode == "argmax":
132
+ return encoder_output.latent_dist.mode()
133
+ elif hasattr(encoder_output, "latents"):
134
+ return encoder_output.latents
135
+ else:
136
+ raise AttributeError("Could not access latents of provided encoder_output")
137
+
138
+ class ControlnetCogVideoXPipeline(DiffusionPipeline, CogVideoXLoraLoaderMixin):
139
+ _optional_components = []
140
+ model_cpu_offload_seq = "text_encoder->transformer->vae"
141
+
142
+ _callback_tensor_inputs = [
143
+ "latents",
144
+ "prompt_embeds",
145
+ "negative_prompt_embeds",
146
+ ]
147
+
148
+ def __init__(
149
+ self,
150
+ tokenizer: T5Tokenizer,
151
+ text_encoder: T5EncoderModel,
152
+ vae: AutoencoderKLCogVideoX,
153
+ transformer: CogVideoXTransformer3DModel,
154
+ scheduler: Union[CogVideoXDDIMScheduler, CogVideoXDPMScheduler],
155
+ ):
156
+ super().__init__()
157
+
158
+ self.register_modules(
159
+ tokenizer=tokenizer, text_encoder=text_encoder, vae=vae, transformer=transformer, scheduler=scheduler
160
+ )
161
+ self.vae_scale_factor_spatial = (
162
+ 2 ** (len(self.vae.config.block_out_channels) - 1) if hasattr(self, "vae") and self.vae is not None else 8
163
+ )
164
+ self.vae_scale_factor_temporal = (
165
+ self.vae.config.temporal_compression_ratio if hasattr(self, "vae") and self.vae is not None else 4
166
+ )
167
+
168
+ self.video_processor = VideoProcessor(vae_scale_factor=self.vae_scale_factor_spatial)
169
+
170
+
171
+
172
+ def _get_t5_prompt_embeds(
173
+ self,
174
+ prompt: Union[str, List[str]] = None,
175
+ num_videos_per_prompt: int = 1,
176
+ max_sequence_length: int = 226,
177
+ device: Optional[torch.device] = None,
178
+ dtype: Optional[torch.dtype] = None,
179
+ ):
180
+ device = device or self._execution_device
181
+ dtype = dtype or self.text_encoder.dtype
182
+
183
+ prompt = [prompt] if isinstance(prompt, str) else prompt
184
+ batch_size = len(prompt)
185
+
186
+ text_inputs = self.tokenizer(
187
+ prompt,
188
+ padding="max_length",
189
+ max_length=max_sequence_length,
190
+ truncation=True,
191
+ add_special_tokens=True,
192
+ return_tensors="pt",
193
+ )
194
+ text_input_ids = text_inputs.input_ids
195
+ untruncated_ids = self.tokenizer(prompt, padding="longest", return_tensors="pt").input_ids
196
+
197
+ if untruncated_ids.shape[-1] >= text_input_ids.shape[-1] and not torch.equal(text_input_ids, untruncated_ids):
198
+ removed_text = self.tokenizer.batch_decode(untruncated_ids[:, max_sequence_length - 1 : -1])
199
+ logger.warning(
200
+ "The following part of your input was truncated because `max_sequence_length` is set to "
201
+ f" {max_sequence_length} tokens: {removed_text}"
202
+ )
203
+
204
+ # Had to disable auto cast here, otherwise the text encoder produces NaNs.
205
+ # Hope it doesn't break training
206
+ with torch.autocast(device_type=device.type, enabled=False):
207
+ prompt_embeds = self.text_encoder(text_input_ids.to(device))[0]
208
+ # prompt embeds is nan here!
209
+ prompt_embeds = prompt_embeds.to(dtype=dtype, device=device)
210
+
211
+ # duplicate text embeddings for each generation per prompt, using mps friendly method
212
+ _, seq_len, _ = prompt_embeds.shape
213
+ prompt_embeds = prompt_embeds.repeat(1, num_videos_per_prompt, 1)
214
+ prompt_embeds = prompt_embeds.view(batch_size * num_videos_per_prompt, seq_len, -1)
215
+
216
+ return prompt_embeds
217
+
218
+ def encode_prompt(
219
+ self,
220
+ prompt: Union[str, List[str]],
221
+ negative_prompt: Optional[Union[str, List[str]]] = None,
222
+ do_classifier_free_guidance: bool = True,
223
+ num_videos_per_prompt: int = 1,
224
+ prompt_embeds: Optional[torch.Tensor] = None,
225
+ negative_prompt_embeds: Optional[torch.Tensor] = None,
226
+ max_sequence_length: int = 226,
227
+ device: Optional[torch.device] = None,
228
+ dtype: Optional[torch.dtype] = None,
229
+ ):
230
+ r"""
231
+ Encodes the prompt into text encoder hidden states.
232
+
233
+ Args:
234
+ prompt (`str` or `List[str]`, *optional*):
235
+ prompt to be encoded
236
+ negative_prompt (`str` or `List[str]`, *optional*):
237
+ The prompt or prompts not to guide the image generation. If not defined, one has to pass
238
+ `negative_prompt_embeds` instead. Ignored when not using guidance (i.e., ignored if `guidance_scale` is
239
+ less than `1`).
240
+ do_classifier_free_guidance (`bool`, *optional*, defaults to `True`):
241
+ Whether to use classifier free guidance or not.
242
+ num_videos_per_prompt (`int`, *optional*, defaults to 1):
243
+ Number of videos that should be generated per prompt. torch device to place the resulting embeddings on
244
+ prompt_embeds (`torch.Tensor`, *optional*):
245
+ Pre-generated text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not
246
+ provided, text embeddings will be generated from `prompt` input argument.
247
+ negative_prompt_embeds (`torch.Tensor`, *optional*):
248
+ Pre-generated negative text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt
249
+ weighting. If not provided, negative_prompt_embeds will be generated from `negative_prompt` input
250
+ argument.
251
+ device: (`torch.device`, *optional*):
252
+ torch device
253
+ dtype: (`torch.dtype`, *optional*):
254
+ torch dtype
255
+ """
256
+ device = device or self._execution_device
257
+
258
+ prompt = [prompt] if isinstance(prompt, str) else prompt
259
+ if prompt is not None:
260
+ batch_size = len(prompt)
261
+ else:
262
+ batch_size = prompt_embeds.shape[0]
263
+
264
+ if prompt_embeds is None:
265
+ prompt_embeds = self._get_t5_prompt_embeds(
266
+ prompt=prompt,
267
+ num_videos_per_prompt=num_videos_per_prompt,
268
+ max_sequence_length=max_sequence_length,
269
+ device=device,
270
+ dtype=dtype,
271
+ )
272
+
273
+ if do_classifier_free_guidance and negative_prompt_embeds is None:
274
+ negative_prompt = negative_prompt or ""
275
+ negative_prompt = batch_size * [negative_prompt] if isinstance(negative_prompt, str) else negative_prompt
276
+
277
+ if prompt is not None and type(prompt) is not type(negative_prompt):
278
+ raise TypeError(
279
+ f"`negative_prompt` should be the same type to `prompt`, but got {type(negative_prompt)} !="
280
+ f" {type(prompt)}."
281
+ )
282
+ elif batch_size != len(negative_prompt):
283
+ raise ValueError(
284
+ f"`negative_prompt`: {negative_prompt} has batch size {len(negative_prompt)}, but `prompt`:"
285
+ f" {prompt} has batch size {batch_size}. Please make sure that passed `negative_prompt` matches"
286
+ " the batch size of `prompt`."
287
+ )
288
+
289
+ negative_prompt_embeds = self._get_t5_prompt_embeds(
290
+ prompt=negative_prompt,
291
+ num_videos_per_prompt=num_videos_per_prompt,
292
+ max_sequence_length=max_sequence_length,
293
+ device=device,
294
+ dtype=dtype,
295
+ )
296
+
297
+ return prompt_embeds, negative_prompt_embeds
298
+
299
+ def prepare_latents(
300
+ self, batch_size, num_channels_latents, num_frames, height, width, dtype, device, generator, latents=None
301
+ ):
302
+ shape = (
303
+ batch_size,
304
+ (num_frames - 1) // self.vae_scale_factor_temporal + 1,
305
+ num_channels_latents,
306
+ height // self.vae_scale_factor_spatial,
307
+ width // self.vae_scale_factor_spatial,
308
+ )
309
+ if isinstance(generator, list) and len(generator) != batch_size:
310
+ raise ValueError(
311
+ f"You have passed a list of generators of length {len(generator)}, but requested an effective batch"
312
+ f" size of {batch_size}. Make sure the batch size matches the length of the generators."
313
+ )
314
+
315
+ if latents is None:
316
+ latents = randn_tensor(shape, generator=generator, device=device, dtype=dtype)
317
+ else:
318
+ latents = latents.to(device)
319
+
320
+ # scale the initial noise by the standard deviation required by the scheduler
321
+ latents = latents * self.scheduler.init_noise_sigma
322
+ return latents
323
+
324
+
325
+
326
+ def prepare_image_latents(self,
327
+ image: torch.Tensor,
328
+ batch_size: int = 1,
329
+ num_channels_latents: int = 16,
330
+ num_frames: int = 13,
331
+ height: int = 60,
332
+ width: int = 90,
333
+ dtype: Optional[torch.dtype] = None,
334
+ device: Optional[torch.device] = None,
335
+ generator: Optional[torch.Generator] = None,
336
+ latents: Optional[torch.Tensor] = None,):
337
+
338
+ image_prepared = prepare_frames(image, (height, width)).to(device).to(dtype=dtype).permute(0, 2, 1, 3, 4) # [B, C, F, H, W]
339
+
340
+ image_latents = [retrieve_latents(self.vae.encode(image_prepared), generator)]
341
+
342
+ image_latents = torch.cat(image_latents, dim=0).to(dtype).permute(0, 2, 1, 3, 4) # [B, F, C, H, W]
343
+
344
+ if not self.vae.config.invert_scale_latents:
345
+ image_latents = self.vae_scaling_factor_image * image_latents
346
+ else:
347
+ # This is awkward but required because the CogVideoX team forgot to multiply the
348
+ # scaling factor during training :)
349
+ image_latents = 1 / self.vae_scaling_factor_image * image_latents
350
+
351
+ # else:
352
+ # # This is awkward but required because the CogVideoX team forgot to multiply the
353
+ # # scaling factor during training :)
354
+ # image_latents = 1 / self.vae_scaling_factor_image * image_latents
355
+
356
+ return image_prepared, image_latents
357
+
358
+ # def decode_latents(self, latents: torch.Tensor) -> torch.Tensor:
359
+ # latents = latents.permute(0, 2, 1, 3, 4) # [batch_size, num_channels, num_frames, height, width]
360
+ # latents = 1 / self.vae.config.scaling_factor * latents
361
+
362
+ # frames = self.vae.decode(latents).sample
363
+ # return frames
364
+
365
+ def decode_latents(self, latents: torch.Tensor) -> torch.Tensor:
366
+ latents = latents.permute(0, 2, 1, 3, 4) # [B, C, T, H, W]
367
+ latents = 1 / self.vae.config.scaling_factor * latents
368
+
369
+ def decode_fn(x):
370
+ return self.vae.decode(x).sample
371
+
372
+ # Use checkpointing to save memory
373
+ frames = checkpoint(decode_fn, latents, use_reentrant=False)
374
+ return frames
375
+
376
+ # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.prepare_extra_step_kwargs
377
+ def prepare_extra_step_kwargs(self, generator, eta):
378
+ # prepare extra kwargs for the scheduler step, since not all schedulers have the same signature
379
+ # eta (η) is only used with the DDIMScheduler, it will be ignored for other schedulers.
380
+ # eta corresponds to η in DDIM paper: https://arxiv.org/abs/2010.02502
381
+ # and should be between [0, 1]
382
+
383
+ accepts_eta = "eta" in set(inspect.signature(self.scheduler.step).parameters.keys())
384
+ extra_step_kwargs = {}
385
+ if accepts_eta:
386
+ extra_step_kwargs["eta"] = eta
387
+
388
+ # check if the scheduler accepts generator
389
+ accepts_generator = "generator" in set(inspect.signature(self.scheduler.step).parameters.keys())
390
+ if accepts_generator:
391
+ extra_step_kwargs["generator"] = generator
392
+ return extra_step_kwargs
393
+
394
+ # Copied from diffusers.pipelines.latte.pipeline_latte.LattePipeline.check_inputs
395
+ def check_inputs(
396
+ self,
397
+ prompt,
398
+ height,
399
+ width,
400
+ negative_prompt,
401
+ callback_on_step_end_tensor_inputs,
402
+ prompt_embeds=None,
403
+ negative_prompt_embeds=None,
404
+ ):
405
+ if height % 8 != 0 or width % 8 != 0:
406
+ raise ValueError(f"`height` and `width` have to be divisible by 8 but are {height} and {width}.")
407
+
408
+ if callback_on_step_end_tensor_inputs is not None and not all(
409
+ k in self._callback_tensor_inputs for k in callback_on_step_end_tensor_inputs
410
+ ):
411
+ raise ValueError(
412
+ f"`callback_on_step_end_tensor_inputs` has to be in {self._callback_tensor_inputs}, but found {[k for k in callback_on_step_end_tensor_inputs if k not in self._callback_tensor_inputs]}"
413
+ )
414
+ if prompt is not None and prompt_embeds is not None:
415
+ raise ValueError(
416
+ f"Cannot forward both `prompt`: {prompt} and `prompt_embeds`: {prompt_embeds}. Please make sure to"
417
+ " only forward one of the two."
418
+ )
419
+ elif prompt is None and prompt_embeds is None:
420
+ raise ValueError(
421
+ "Provide either `prompt` or `prompt_embeds`. Cannot leave both `prompt` and `prompt_embeds` undefined."
422
+ )
423
+ elif prompt is not None and (not isinstance(prompt, str) and not isinstance(prompt, list)):
424
+ raise ValueError(f"`prompt` has to be of type `str` or `list` but is {type(prompt)}")
425
+
426
+ if prompt is not None and negative_prompt_embeds is not None:
427
+ raise ValueError(
428
+ f"Cannot forward both `prompt`: {prompt} and `negative_prompt_embeds`:"
429
+ f" {negative_prompt_embeds}. Please make sure to only forward one of the two."
430
+ )
431
+
432
+ if negative_prompt is not None and negative_prompt_embeds is not None:
433
+ raise ValueError(
434
+ f"Cannot forward both `negative_prompt`: {negative_prompt} and `negative_prompt_embeds`:"
435
+ f" {negative_prompt_embeds}. Please make sure to only forward one of the two."
436
+ )
437
+
438
+ if prompt_embeds is not None and negative_prompt_embeds is not None:
439
+ if prompt_embeds.shape != negative_prompt_embeds.shape:
440
+ raise ValueError(
441
+ "`prompt_embeds` and `negative_prompt_embeds` must have the same shape when passed directly, but"
442
+ f" got: `prompt_embeds` {prompt_embeds.shape} != `negative_prompt_embeds`"
443
+ f" {negative_prompt_embeds.shape}."
444
+ )
445
+ def fuse_qkv_projections(self) -> None:
446
+ r"""Enables fused QKV projections."""
447
+ self.fusing_transformer = True
448
+ self.transformer.fuse_qkv_projections()
449
+
450
+ def unfuse_qkv_projections(self) -> None:
451
+ r"""Disable QKV projection fusion if enabled."""
452
+ if not self.fusing_transformer:
453
+ logger.warning("The Transformer was not initially fused for QKV projections. Doing nothing.")
454
+ else:
455
+ self.transformer.unfuse_qkv_projections()
456
+ self.fusing_transformer = False
457
+
458
+ def _prepare_rotary_positional_embeddings(
459
+ self,
460
+ height: int,
461
+ width: int,
462
+ num_frames: int,
463
+ device: torch.device,
464
+ ) -> Tuple[torch.Tensor, torch.Tensor]:
465
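+ # Grid sizes are expressed in transformer patches; 720x480 is the base spatial resolution the rotary embeddings are defined for.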
+ grid_height = height // (self.vae_scale_factor_spatial * self.transformer.config.patch_size)
466
+ grid_width = width // (self.vae_scale_factor_spatial * self.transformer.config.patch_size)
467
+ base_size_width = 720 // (self.vae_scale_factor_spatial * self.transformer.config.patch_size)
468
+ base_size_height = 480 // (self.vae_scale_factor_spatial * self.transformer.config.patch_size)
469
+
470
+ grid_crops_coords = get_resize_crop_region_for_grid(
471
+ (grid_height, grid_width), base_size_width, base_size_height
472
+ )
473
+ freqs_cos, freqs_sin = get_3d_rotary_pos_embed(
474
+ embed_dim=self.transformer.config.attention_head_dim,
475
+ crops_coords=grid_crops_coords,
476
+ grid_size=(grid_height, grid_width),
477
+ temporal_size=num_frames,
478
+ )
479
+
480
+ freqs_cos = freqs_cos.to(device=device)
481
+ freqs_sin = freqs_sin.to(device=device)
482
+ return freqs_cos, freqs_sin
483
+
484
+ @property
485
+ def guidance_scale(self):
486
+ return self._guidance_scale
487
+
488
+ @property
489
+ def num_timesteps(self):
490
+ return self._num_timesteps
491
+
492
+ @property
493
+ def attention_kwargs(self):
494
+ return self._attention_kwargs
495
+
496
+ @property
497
+ def interrupt(self):
498
+ return self._interrupt
499
+
500
+ @torch.no_grad()
501
+ def __call__(
502
+ self,
503
+ image,
504
+ input_intervals,
505
+ output_intervals,
506
+ prompt: Optional[Union[str, List[str]]] = None,
507
+ negative_prompt: Optional[Union[str, List[str]]] = None,
508
+ height: int = 480,
509
+ width: int = 720,
510
+ num_frames: int = 49,
511
+ num_inference_steps: int = 50,
512
+ timesteps: Optional[List[int]] = None,
513
+ guidance_scale: float = 6,
514
+ use_dynamic_cfg: bool = False,
515
+ num_videos_per_prompt: int = 1,
516
+ eta: float = 0.0,
517
+ generator: Optional[Union[torch.Generator, List[torch.Generator]]] = None,
518
+ latents: Optional[torch.FloatTensor] = None,
519
+ prompt_embeds: Optional[torch.FloatTensor] = None,
520
+ negative_prompt_embeds: Optional[torch.FloatTensor] = None,
521
+ output_type: str = "pil",
522
+ return_dict: bool = True,
523
+ callback_on_step_end: Optional[
524
+ Union[Callable[[int, int, Dict], None], PipelineCallback, MultiPipelineCallbacks]
525
+ ] = None,
526
+ callback_on_step_end_tensor_inputs: List[str] = ["latents"],
527
+ max_sequence_length: int = 226,
528
+ ) -> Union[CogVideoXPipelineOutput, Tuple]:
529
+ if num_frames > 49:
530
+ raise ValueError(
531
+ "The number of frames must be less than 49 for now due to static positional embeddings. This will be updated in the future to remove this limitation."
532
+ )
533
+
534
+ if isinstance(callback_on_step_end, (PipelineCallback, MultiPipelineCallbacks)):
535
+ callback_on_step_end_tensor_inputs = callback_on_step_end.tensor_inputs
536
+
537
+ height = height or self.transformer.config.sample_size * self.vae_scale_factor_spatial
538
+ width = width or self.transformer.config.sample_size * self.vae_scale_factor_spatial
539
+ num_videos_per_prompt = 1
540
+
541
+ self.vae_scaling_factor_image = self.vae.config.scaling_factor if getattr(self, "vae", None) else 0.7
542
+
543
+
544
+ # 1. Check inputs. Raise error if not correct
545
+ self.check_inputs(
546
+ prompt,
547
+ height,
548
+ width,
549
+ negative_prompt,
550
+ callback_on_step_end_tensor_inputs,
551
+ prompt_embeds,
552
+ negative_prompt_embeds,
553
+ )
554
+ self._guidance_scale = guidance_scale
555
+ self._interrupt = False
556
+
557
+ # 2. Default call parameters
558
+ if prompt is not None and isinstance(prompt, str):
559
+ batch_size = 1
560
+ elif prompt is not None and isinstance(prompt, list):
561
+ batch_size = len(prompt)
562
+ else:
563
+ batch_size = prompt_embeds.shape[0]
564
+
565
+ device = self._execution_device
566
+
567
+ # here `guidance_scale` is defined analog to the guidance weight `w` of equation (2)
568
+ # of the Imagen paper: https://arxiv.org/pdf/2205.11487.pdf . `guidance_scale = 1`
569
+ # corresponds to doing no classifier free guidance.
570
+ do_classifier_free_guidance = guidance_scale > 1.0
571
+
572
+ # 3. Encode input prompt
573
+ prompt_embeds, negative_prompt_embeds = self.encode_prompt(
574
+ prompt,
575
+ negative_prompt,
576
+ do_classifier_free_guidance,
577
+ num_videos_per_prompt=num_videos_per_prompt,
578
+ prompt_embeds=prompt_embeds,
579
+ negative_prompt_embeds=negative_prompt_embeds,
580
+ max_sequence_length=max_sequence_length,
581
+ device=device,
582
+ )
583
+ if do_classifier_free_guidance:
584
+ prompt_embeds = torch.cat([negative_prompt_embeds, prompt_embeds], dim=0)
585
+
586
+ # 4. Prepare timesteps
587
+ timesteps, num_inference_steps = retrieve_timesteps(self.scheduler, num_inference_steps, device, timesteps)
588
+ self._num_timesteps = len(timesteps)
589
+
590
+ # 5. Prepare latents.
591
+ latent_channels = 16 #self.transformer.config.in_channels
592
+ latents = self.prepare_latents(
593
+ batch_size * num_videos_per_prompt,
594
+ latent_channels,
595
+ num_frames,
596
+ height,
597
+ width,
598
+ prompt_embeds.dtype,
599
+ device,
600
+ generator,
601
+ latents,
602
+ )
603
+
604
+
605
+
606
+ image_prepared, image_latents = self.prepare_image_latents(
607
+ image,
608
+ batch_size=batch_size,
609
+ num_channels_latents=latent_channels,
610
+ num_frames=num_frames,
611
+ height=height,
612
+ width=width,
613
+ dtype=prompt_embeds.dtype,
614
+ device=device,
615
+ generator=generator,
616
+ )
617
+
618
+
619
+ # 7. Prepare extra step kwargs. TODO: Logic should ideally just be moved out of the pipeline
620
+ extra_step_kwargs = self.prepare_extra_step_kwargs(generator, eta)
621
+
622
+ # 8. Create rotary positional embeddings if required (currently unused)
623
+ image_rotary_emb = (
624
+ self._prepare_rotary_positional_embeddings(height, width, latents.size(1), device)
625
+ if self.transformer.config.use_rotary_positional_embeddings
626
+ else None
627
+ )
628
+
629
+ # 9. Denoising loop
630
+ num_warmup_steps = max(len(timesteps) - num_inference_steps * self.scheduler.order, 0)
631
+
632
+
633
+ input_intervals = input_intervals.to(device)
634
+ output_intervals = output_intervals.to(device)
635
+
636
+ input_intervals = transform_intervals(input_intervals)
637
+ output_intervals = transform_intervals(output_intervals)
638
+
639
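+ # Insert the clean image latents at the conditioning frame position and build condition_mask marking which latent frames are conditioning frames ("just_one" presumably restricts this to a single frame).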
+ latents_initial, target, condition_mask, intervals = random_insert_latent_frame(image_latents, latents, latents, input_intervals, output_intervals, special_info="just_one")
640
+
641
+ latents = latents_initial.clone()
642
+ with self.progress_bar(total=num_inference_steps) as progress_bar:
643
+ # for DPM-solver++
644
+ old_pred_original_sample = None
645
+ for i, t in enumerate(timesteps):
646
+ if self.interrupt:
647
+ continue
648
+
649
+ latent_model_input = torch.cat([latents] * 2) if do_classifier_free_guidance else latents
650
+ #replace first latent with image_latents
651
+ latent_model_input = self.scheduler.scale_model_input(latent_model_input, t)
652
+
653
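+ # The CFG batch is [unconditional, conditional]: zero out the conditioning latents in the unconditional branch and keep their clean values in the conditional branch.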
+ if do_classifier_free_guidance:
654
+ latent_model_input[0][condition_mask[0]] = 0 #set unconditioned latents to 0
655
+ #TODO: Replace the conditional latents with the input latents
656
+ latent_model_input[1][condition_mask[0]] = latents_initial[0][condition_mask[0]].to(latent_model_input.dtype)
657
+ else:
658
+ latent_model_input[:, condition_mask[0]] = latents_initial[0][condition_mask[0]].to(latent_model_input.dtype)
659
+
660
+ timestep = t.expand(latent_model_input.shape[0])
661
+
662
+ current_sampling_percent = i / len(timesteps)
663
+
664
+ latent_model_input = latent_model_input.to(dtype=self.transformer.dtype)
665
+ prompt_embeds = prompt_embeds.to(dtype=self.transformer.dtype)
666
+ # predict noise model_output
667
+ noise_pred = self.transformer(
668
+ hidden_states=latent_model_input,
669
+ encoder_hidden_states=prompt_embeds,
670
+ timestep=timestep,
671
+ intervals=intervals,
672
+ condition_mask=condition_mask,
673
+ image_rotary_emb=image_rotary_emb,
674
+ return_dict=False,
675
+ )[0]
676
+ noise_pred = noise_pred.float()
677
+
678
+ # perform guidance
679
+ if use_dynamic_cfg:
680
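+ # Dynamic CFG: ramp the guidance weight over the sampling trajectory with a cosine schedule (as in the stock CogVideoX pipelines).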
+ self._guidance_scale = 1 + guidance_scale * (
681
+ (1 - math.cos(math.pi * ((num_inference_steps - t.item()) / num_inference_steps) ** 5.0)) / 2
682
+ )
683
+ if do_classifier_free_guidance:
684
+ noise_pred_uncond, noise_pred_cond = noise_pred.chunk(2)
685
+ noise_pred = noise_pred_uncond + self.guidance_scale * (noise_pred_cond - noise_pred_uncond)
686
+ # Note: the conditional branch does not give a realistic noise prediction for the conditioning frame itself;
687
+ # ideally the unconditional noise at that frame would be replaced instead.
688
+
689
+ # compute the previous noisy sample x_t -> x_t-1
690
+ if not isinstance(self.scheduler, CogVideoXDPMScheduler):
691
+ latents = self.scheduler.step(noise_pred, t, latents, **extra_step_kwargs, return_dict=False)[0]
692
+ else:
693
+ latents, old_pred_original_sample = self.scheduler.step(
694
+ noise_pred,
695
+ old_pred_original_sample,
696
+ t,
697
+ timesteps[i - 1] if i > 0 else None,
698
+ latents,
699
+ **extra_step_kwargs,
700
+ return_dict=False,
701
+ )
702
+ latents = latents.to(prompt_embeds.dtype)
703
+
704
+ # call the callback, if provided
705
+ if callback_on_step_end is not None:
706
+ callback_kwargs = {}
707
+ for k in callback_on_step_end_tensor_inputs:
708
+ callback_kwargs[k] = locals()[k]
709
+ callback_outputs = callback_on_step_end(self, i, t, callback_kwargs)
710
+
711
+ latents = callback_outputs.pop("latents", latents)
712
+ prompt_embeds = callback_outputs.pop("prompt_embeds", prompt_embeds)
713
+ negative_prompt_embeds = callback_outputs.pop("negative_prompt_embeds", negative_prompt_embeds)
714
+
715
+ if i == len(timesteps) - 1 or ((i + 1) > num_warmup_steps and (i + 1) % self.scheduler.order == 0):
716
+ progress_bar.update()
717
+
718
+ # After the denoising loop the conditioning latents are dropped before decoding; the commented line below would instead replace them with image_latents.
719
+ #latents[:, motion_blur_amount:motion_blur_amount+1] = image_latents[:, 0:1]
720
+ if not output_type == "latent":
721
+ latents = latents[~condition_mask].unsqueeze(0)
722
+ video = self.decode_latents(latents)
723
+ video = self.video_processor.postprocess_video(video=video, output_type=output_type)
724
+ else:
725
+ video = latents
726
+
727
+ # Offload all models
728
+ self.maybe_free_model_hooks()
729
+
730
+ if not return_dict:
731
+ return (video,)
732
+
733
+ return CogVideoXPipelineOutput(frames=video)
extra/checkpoints_to_hf.py ADDED
@@ -0,0 +1,16 @@
1
+ from huggingface_hub import HfApi
2
+ import os
3
+ # Run with HF_TOKEN=your_hf_token set in the environment before running this script
4
+ api = HfApi(token=os.getenv("HF_TOKEN"))
5
+ folders = ["/datasets/sai/blur2vid/training/cogvideox-baist-test",
6
+ "/datasets/sai/blur2vid/training/cogvideox-gopro-test",
7
+ "/datasets/sai/blur2vid/training/cogvideox-gopro-2x-test",
8
+ "/datasets/sai/blur2vid/training/cogvideox-full-test",
9
+ "/datasets/sai/blur2vid/training/cogvideox-outsidephotos"]
10
+ for folder in folders:
11
+ api.upload_folder(
12
+ folder_path=folder,
13
+ repo_id="tedlasai/blur2vid",
14
+ repo_type="model",
15
+ path_in_repo=os.path.basename(folder)
16
+ )
extra/moMets-parallel-baist.py ADDED
@@ -0,0 +1,330 @@
1
+ # Motion Metrics
2
+ from concurrent.futures import ProcessPoolExecutor, as_completed
3
+ import numpy as np
4
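+ # Restore aliases removed in NumPy >= 1.24 for older dependencies that still reference np.float / np.int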
+ np.float = np.float64
5
+ np.int = np.int_
6
+ import os
7
+ from cdfvd import fvd
8
+ from skimage.metrics import structural_similarity
9
+ import torch
10
+ import lpips
11
+ #from DISTS_pytorch import DISTS
12
+ #import colour as c
13
+ #from torchmetrics.image.fid import FrechetInceptionDistance
14
+ import torch.nn.functional as F
15
+ from epe_metric import compute_bidirectional_epe as epe
16
+ import pdb
17
+ import multiprocessing
18
+ import cv2
19
+ import glob
20
+ # init
21
+ dataDir = 'BAISTResultsImages' # 'dataGoPro' #
22
+ gtDir = 'GT' #'GT' #
23
+ methodDirs = ['Ours', 'Animation-from-blur'] #['Favaro','MotionETR','Ours','GOPROGeneralize'] #
24
+ depth = 8
25
+ resFile = './kellytest.npy'#resultsGoPro20250520.npy'#
26
+
27
+ patchDim = 32 #64 #
28
+ pixMax = 1.0
29
+
30
+ nMets = 7 # new results: scoreFVD, scorePWPSNR, scoreEPE, scoreBlurryPSNR, scorePatchSSIM, scorePatchLPIPS, scorePSNR
31
+ compute = True # if False, load previously computed
32
+ eps = 1e-8
33
+
34
+ device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
35
+
36
+ def read_pngs_to_array(path):
37
+ """Read all PNGs in `path`, sort them by filename, convert BGR→RGB, and stack into an np.ndarray."""
38
+ return np.stack([
39
+ cv2.imread(f, cv2.IMREAD_UNCHANGED)[..., ::-1]
40
+ for f in sorted(glob.glob(f"{path}/*.png"))
41
+ ])
42
+
43
+
44
+ # Use 'spawn' to avoid CUDA context issues
45
+ multiprocessing.freeze_support() # on Windows
46
+ multiprocessing.set_start_method('spawn', force=True)
47
+
48
+ def compute_method(results_local, methodDir, files, countMethod):
49
+
50
+ fnLPIPS = lpips.LPIPS(net='alex').to(device)
51
+ #fnDISTS = DISTS().to(device)
52
+ fnFVD = fvd.cdfvd(model='videomae', device=device)
53
+
54
+ countFile = -1
55
+ for file in files:
56
+ countFile+=1
57
+
58
+ # pull frames from the PNG sequence
59
+ pathMethod = os.path.join(dataDir, methodDir, file)
60
+ framesMethod = np.clip(read_pngs_to_array(pathMethod).astype(np.float32) / (2**depth-1),0,1)
61
+ pathGT = os.path.join(dataDir, gtDir, file)
62
+ framesGT = np.clip(read_pngs_to_array(pathGT).astype(np.float32) / (2**depth-1),0,1)
63
+
64
+ #make sure the GT and method have the same shape
65
+ assert framesGT.shape == framesMethod.shape, f"GT shape {framesGT.shape} does not match method shape {framesMethod.shape} for file {file}"
66
+ # video metrics
67
+
68
+ # vmaf
69
+ #scoreVMAF = callVMAF(pathGT, pathMethod)
70
+
71
+ # EPE - convert frames to tensors for the bidirectional end-point-error metric
72
+ framesMethodTensor = torch.from_numpy(framesMethod)
73
+ framesGTtensor = torch.from_numpy(framesGT)
74
+ scoreEPE = epe(framesMethodTensor[0,:,:,:], framesMethodTensor[-1,:,:,:], framesGTtensor[0,:,:,:], framesGTtensor[-1,:,:,:], per_pixel_mode=True).cpu().detach().numpy()
75
+
76
+ # motion blur baseline
77
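+ # Re-synthesize the blurry frame by averaging frames in approximately linear light (gamma 2.2) for both GT and method, then compare the two blurs with PSNR below.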
+ blurryGT = np.mean(framesGT ** 2.2,axis=0) ** (1/2.2)
78
+ blurryMethod = np.mean(framesMethod ** 2.2,axis=0) ** (1/2.2)
79
+ # MSE -> PSNR
80
+ mapBlurryMSE = (blurryGT - blurryMethod)**2
81
+ scoreBlurryMSE = np.mean(mapBlurryMSE)
82
+ scoreBlurryPSNR = (10 * np.log10(pixMax**2 / scoreBlurryMSE))
83
+
84
+ # fvd
85
+ #scoreFVD = fnFVD.compute_fvd(real_videos=(np.expand_dims(framesGT, axis=0)*(2**depth-1)).astype(np.uint8), fake_videos=(np.expand_dims(framesMethod, axis=0)*(2**depth-1)).astype(np.uint8))
86
+ framesGTfvd = np.expand_dims((framesGT * (2**depth-1)).astype(np.uint8), axis=0)
87
+ fnFVD.add_real_stats(framesGTfvd)
88
+ framesMethodFVD = np.expand_dims((framesMethod * (2**depth-1)).astype(np.uint8), axis=0)
89
+ fnFVD.add_fake_stats(framesMethodFVD)
90
+
91
+ # loop directions
92
+ framesMSE = np.stack((framesGT,framesGT)) # pre-allocate array for per-direction MSE maps
93
+ countDirect = -1
94
+ for direction in directions:
95
+ countDirect = countDirect+1
96
+ order = direction
97
+
98
+ # loop frames + image level metrics
99
+ countFrames = -1
100
+ for i in order:
101
+ countFrames+=1
102
+
103
+ frameMethod = framesMethod[i,:,:,:] # method frames can be re-ordered
104
+ frameGT = framesGT[countFrames,:,:,:]
105
+
106
+
107
+ # assert image size is divisible by patch size
108
+ rows, cols, ch = frameGT.shape
109
+ assert rows % patchDim == 0, f"rows {rows} is not divisible by patchDim {patchDim}"
110
+ assert cols % patchDim == 0, f"cols {cols} is not divisible by patchDim {patchDim}"
111
+
112
+ rPatch = np.ceil(rows/patchDim)
113
+ cPatch = np.ceil(cols/patchDim)
114
+
115
+ # LPIPS
116
+ #pdb.set_trace()
117
+ methodTensor = (torch.from_numpy(np.moveaxis(frameMethod, -1, 0)).unsqueeze(0) * 2 - 1).to(device)
118
+ gtTensor = (torch.from_numpy(np.moveaxis(frameGT, -1, 0)).unsqueeze(0) * 2 - 1).to(device)
119
+ #scoreLPIPS = fnLPIPS(gtTensor, methodTensor).squeeze(0,1,2).cpu().detach().numpy()[0]
120
+
121
+ # FID
122
+ #fnFID.update((gtTensor * (2**depth - 1)).to(torch.uint8), real=True)
123
+ #fnFID.update((methodTensor * (2**depth - 1)).to(torch.uint8), real=False)
124
+
125
+ # DISTS
126
+ #scoreDISTS = fnDISTS(gtTensor.to(torch.float), methodTensor.to(torch.float), require_grad=True, batch_average=True).cpu().detach().numpy()
127
+
128
+ # compute ssim
129
+ #scoreSSIM = structural_similarity(frameGT, frameMethod, data_range=pixMax, channel_axis=2)
130
+
131
+ # compute DE 2000
132
+ #frameMethodXYZ = c.RGB_to_XYZ(frameMethod, c.models.RGB_COLOURSPACE_sRGB, apply_cctf_decoding=True)
133
+ #frameMethodLAB = c.XYZ_to_Lab(frameMethodXYZ)
134
+ #frameGTXYZ = c.RGB_to_XYZ(frameGT, c.models.RGB_COLOURSPACE_sRGB, apply_cctf_decoding=True)
135
+ #frameGTLAB = c.XYZ_to_Lab(frameGTXYZ)
136
+ #mapDE2000 = c.delta_E(frameGTLAB, frameMethodLAB, method='CIE 2000')
137
+ #scoreDE2000 = np.mean(mapDE2000)
138
+
139
+ # MSE
140
+ mapMSE = (frameGT - frameMethod)**2
141
+ scoreMSE = np.mean(mapMSE)
142
+
143
+ # PSNR
144
+ framesMSE[countDirect,countFrames,:,:,:] = mapMSE
145
+ #framesPSNR[countDirect,countFrames,:,:,:] = np.clip((10 * np.log10(pixMax**2 / np.clip(mapMSE,a_min=1e-10,a_max=None))),0,100)
146
+ scorePSNR = (10 * np.log10(pixMax**2 / scoreMSE))
147
+
148
+ #for l in range(ch):
149
+
150
+ # channel-wise metrics
151
+ #chanFrameMethod = frameMethod[:,:,l]
152
+ #chanFrameGT = frameGT[:,:,l]
153
+
154
+ # loop patches rows
155
+ for j in range(int(rPatch)):
156
+
157
+ # loop patches cols + patch level metrics
158
+ for k in range(int(cPatch)):
159
+
160
+ startR = j*patchDim
161
+ startC = k*patchDim
162
+ endR = j*patchDim+patchDim
163
+ endC = k*patchDim+patchDim
164
+
165
+ if endR > rows:
166
+ endR = rows
167
+ else:
168
+ pass
169
+
170
+ if endC > cols:
171
+ endC = cols
172
+ else:
173
+ pass
174
+
175
+ # patch metrics
176
+ #patchMSE = np.mean(mapMSE[startR:endR,startC:endC,:])
177
+ #scorePatchPSNR = np.clip((10 * np.log10(pixMax**2 / patchMSE)),0,100)
178
+ if dataDir == 'BAISTResultsImages':
179
+ patchGtTensor = F.interpolate(gtTensor[:,:,startR:endR,startC:endC], scale_factor=2.0, mode='bilinear', align_corners=False)
180
+ patchMethodTensor = F.interpolate(methodTensor[:,:,startR:endR,startC:endC], scale_factor=2.0, mode='bilinear', align_corners=False)
181
+ scorePatchLPIPS = fnLPIPS(patchGtTensor, patchMethodTensor).squeeze(0,1,2).cpu().detach().numpy()[0]
182
+ else:
183
+ scorePatchLPIPS = fnLPIPS(gtTensor[:,:,startR:endR,startC:endC], methodTensor[:,:,startR:endR,startC:endC]).squeeze(0,1,2).cpu().detach().numpy()[0]
184
+ scorePatchSSIM = structural_similarity(frameGT[startR:endR,startC:endC,:], frameMethod[startR:endR,startC:endC,:], data_range=pixMax, channel_axis=2)
185
+ #scorePatchDISTS = fnDISTS(gtTensor[:,:,startR:endR,startC:endC].to(torch.float), methodTensor[:,:,startR:endR,startC:endC].to(torch.float), require_grad=True, batch_average=True).cpu().detach().numpy()
186
+ #scorePatchDE2000 = np.mean(mapDE2000[startR:endR,startC:endC])
187
+
188
+ # i: frame number, j: patch row, k: patch col
189
+ #results[countMethod,countFile,countDirect,i,j,k,3:] = [scoreEPE, scoreBlurryPSNR, scoreLPIPS, scoreDISTS, scoreSSIM, scoreDE2000, scorePSNR, scorePatchPSNR, scorePatchSSIM, scorePatchLPIPS, scorePatchDISTS, scorePatchDE2000]
190
+ results_local[countMethod,countFile,countDirect,i,j,k,2:] = [scoreEPE, scoreBlurryPSNR, scorePatchSSIM, scorePatchLPIPS, scorePSNR]
191
+ print('Method: ', methodDir, ' File: ', file, ' Frame: ', str(i), ' PSNR: ', scorePSNR, end='\r')
192
+ #print('VMAF: ', str(scoreVMAF), ' FVD: ', str(scoreFVD), ' LPIPS: ', str(scoreLPIPS), ' FID: ', str(scoreFID), ' DISTS: ', str(scoreDISTS), ' SSIM: ', str(scoreSSIM), ' DE2000: ', str(scoreDE2000), ' PSNR: ', str(scorePSNR), ' Patch PSNR: ', str(scorePatchPSNR), end='\r')
193
+ #pdb.set_trace()
194
+ scorePWPSNR = (10 * np.log10(pixMax**2 / np.mean(np.min(np.mean(framesMSE, axis=(1)),axis=0)))) # per pixel, keep the better (lower-MSE) frame ordering of the two directions, average over the image, then convert to PSNR
195
+ #print('Method: ', methodDir, ' File: ', file, ' Frame: ', str(i), ' PWPSNR: ', scorePWPSNR, end='\n')
196
+ #scorePWPSNR = np.clip((10 * np.log10(pixMax**2 / np.mean(np.min(framesPSNR, axis=0),axis=(1,2,3)))),0,100) # take max pixel wise PSNR per direction, average over image dims
197
+ results_local[countMethod,countFile,:,:,:,:,1] = np.tile(scorePWPSNR, results_local.shape[2:-1])#np.broadcast_to(scorePWPSNR[:, np.newaxis, np.newaxis], results.shape[3:-1])
198
+ np.save(resFile, results_local) # save part of the way through the loop ..
199
+
200
+ #scoreFID = fnFID.compute().cpu().detach().numpy()
201
+ #fnFID.reset()
202
+ #results[countMethod,:,:,:,:,:,0] = np.tile(scoreFID, results.shape[1:-1])
203
+ scoreFVD = fnFVD.compute_fvd_from_stats()
204
+ fnFVD.empty_real_stats()
205
+ fnFVD.empty_fake_stats()
206
+ results_local[countMethod,:,:,:,:,:,0] = np.tile(scoreFVD, results_local.shape[1:-1])
207
+ print('Results computed .. analyzing ..')
208
+
209
+ return results_local
210
+
211
+
212
+ # init results matrix
213
+ path = os.path.join(dataDir, gtDir)
214
+ clipDirs = [name for name in os.listdir(path) if os.path.isdir(os.path.join(path, name))]
215
+ files = []
216
+ if dataDir == 'BAISTResultsImages':
217
+ extraFknDir = 'blur'
218
+ else:
219
+ extraFknDir = ''
220
+ for clipDir in clipDirs:
221
+ path = os.path.join(dataDir, gtDir, clipDir, extraFknDir)
222
+ files = files + [os.path.join(clipDir,extraFknDir,name) for name in os.listdir(path)]
223
+ files = sorted(files)
224
+ path = os.path.join(dataDir, methodDirs[0], files[0])
225
+ testFileGT = read_pngs_to_array(path)
226
+ frams,rows,cols,ch = testFileGT.shape
227
+ framRange = [i for i in range(frams)]
228
+ directions = [framRange, framRange[::-1]]
229
+
230
+ #loop through all methods and make sure they all have the same directory structure and same number of files
231
+ for methodDir in methodDirs:
232
+ path = os.path.join(dataDir, methodDir)
233
+ clipDirs = [name for name in os.listdir(path) if os.path.isdir(os.path.join(path, name))]
234
+ filesMethod = []
235
+ for clipDir in clipDirs:
236
+ path = os.path.join(dataDir, methodDir, clipDir, extraFknDir)
237
+ filesMethod = filesMethod + [os.path.join(clipDir,extraFknDir,name) for name in os.listdir(path)]
238
+ filesMethod = sorted(filesMethod)
239
+ assert len(files) == len(filesMethod), f"Number of files in {methodDir} does not match GT number of files"
240
+ assert files == filesMethod, f"Files in {methodDir} do not match GT files"
241
+
242
+ def main():
243
+
244
+ results = np.zeros((len(methodDirs),len(files),len(directions),frams,int(np.ceil(rows/patchDim)),int(np.ceil(cols/patchDim)),nMets))
245
+
246
+ if compute:
247
+
248
+ # loop methods + compute dataset level metrics (after nested for loops)
249
+ import multiprocessing as mp
250
+ ctx = mp.get_context('spawn')
251
+ with ProcessPoolExecutor(mp_context=ctx, max_workers=len(methodDirs)) as executor:
252
+ # submit one job per method
253
+ futures = {
254
+ executor.submit(compute_method, np.copy(results), md, files, idx): idx
255
+ for idx, md in enumerate(methodDirs)
256
+ }
257
+ # collect and merge results as they finish
258
+ for fut in as_completed(futures):
259
+ idx = futures[fut]
260
+ res_local = fut.result()
261
+ results[idx] = res_local[idx]
262
+
263
+
264
+ else:
265
+
266
+ results = np.load(resFile)
267
+
268
+ np.save(resFile, results)
269
+ # analyze
270
+
271
+ # new results: scoreFID, scoreFVD, scorePWPSNR, scoreEPE, scoreLPIPS, scoreDISTS, scoreSSIM, scoreDE2000, scorePSNR, scorePatchPSNR, scorePatchSSIM, scorePatchLPIPS, scorePatchDISTS, scorePatchDE2000
272
+ upMetrics = [1,3,4,6]
273
+
274
+
275
+ # 0508 results: scoreFID, scoreFVD, scoreLPIPS, scoreDISTS, scoreSSIM, scoreDE2000, scorePSNR, scorePatchPSNR, scorePatchSSIM, scorePatchLPIPS, scorePatchDISTS, scorePatchDE2000
276
+ #upMetrics = [4,6,7,8] # PSNR, SSIM, Patch PSNR, Patch SSIM
277
+ print("Results shape 1: ", results.shape)
278
+ forwardBackwardResults = np.mean(results,axis=(3))
279
+ #print("Results shape 2: ", forwardResults.shape)
280
+ maxDirResults = np.max(forwardBackwardResults,axis=(2))
281
+ minDirResults = np.min(forwardBackwardResults,axis=(2))
282
+ bestDirResults = minDirResults
283
+ #pdb.set_trace()
284
+ bestDirResults[:,:,:,:,upMetrics] = maxDirResults[:,:,:,:,upMetrics]
285
+ import pdb
286
+ #pdb.set_trace()
287
+
288
+ meanResults = bestDirResults.mean(axis=(1, 2, 3)) # Shape becomes (num_methods, nMets)
289
+ meanResultsT = meanResults.T
290
+
291
+ '''
292
+ maxDirResults = np.max(results,axis=2)
293
+ minDirResults = np.min(results,axis=2)
294
+ bestDirResults = minDirResults
295
+ bestDirResults[:,:,:,:,:,upMetrics] = maxDirResults[:,:,:,:,:,upMetrics]
296
+ meanResults = bestDirResults.mean(axis=(1, 2, 3, 4)) # Shape becomes (3, 6)
297
+ meanResultsT = meanResults.T
298
+ '''
299
+
300
+ #
301
+ #meanResults = forwardResults.mean(axis=(1, 2, 3, 4)) # Shape becomes (3, 6)
302
+ #meanResultsT = meanResults.T
303
+
304
+ # print latex table
305
+ method_labels = methodDirs
306
+
307
+ # results 0508: scoreLPIPS, scoreDISTS, scoreSSIM, scoreDE2000, scorePSNR, scorePatchPSNR, scorePatchSSIM, scorePatchLPIPS, scorePatchDISTS, scoreFID, scoreFVD
308
+ # metric_labels = ["FID $\downarrow$","FVD $\downarrow$","LPIPS $\downarrow$", "DISTS $\downarrow$", "SSIM $\downarrow$", "DE2000 $\downarrow$", "PSNR $\downarrow$", "Patch PSNR $\downarrow$", "Patch SSIM $\downarrow$", "Patch LPIPS $\downarrow$", "Patch DISTS $\downarrow$", "Patch DE2000 $\downarrow$"]
309
+ # results 0517:
310
+ # metric_labels = ["FID $\downarrow$","FVD $\downarrow$","PWPSNR $\downarrow$","EPE $\downarrow$","BlurryPSNR $\downarrow$", "LPIPS $\downarrow$", "DISTS $\downarrow$", "SSIM $\downarrow$", "DE2000 $\downarrow$", "PSNR $\downarrow$", "Patch PSNR $\downarrow$", "Patch SSIM $\downarrow$", "Patch LPIPS $\downarrow$", "Patch DISTS $\downarrow$", "Patch DE2000 $\downarrow$"]
311
+
312
+ # results 0518:
313
+ metric_labels = ["FVD $\downarrow$","PWPSNR $\downarrow$","EPE $\downarrow$","BlurryPSNR $\downarrow$","Patch SSIM $\downarrow$","Patch LPIPS $\downarrow$", "PSNR $\downarrow$"]
314
+
315
+ # appropriate for results 0507
316
+ #metric_labels = ["FID $\downarrow$", "FVD $\downarrow$", "LPIPS $\downarrow$", "DISTS $\downarrow$", "SSIM $\downarrow$", "DE2000 $\downarrow$", "PSNR $\downarrow$"]
317
+
318
+ latex_table = "\\begin{tabular}{l" + "c" * len(method_labels) + "}\n"
319
+ latex_table += "Metric & " + " & ".join(method_labels) + " \\\\\n"
320
+ latex_table += "\\hline\n"
321
+
322
+ for metric, row in zip(metric_labels, meanResultsT):
323
+ row_values = " & ".join(f"{v:.4f}" for v in row)
324
+ latex_table += f"{metric} & {row_values} \\\\\n"
325
+
326
+ latex_table += "\\end{tabular}"
327
+ print(latex_table)
328
+
329
+ if __name__ == '__main__':
330
+ main()
extra/moMets-parallel-gopro.py ADDED
@@ -0,0 +1,343 @@
1
+ # Motion Metrics
2
+ from concurrent.futures import ProcessPoolExecutor, as_completed
3
+ import numpy as np
4
+ np.float = np.float64
5
+ np.int = np.int_
6
+ import os
7
+ from cdfvd import fvd
8
+ from skimage.metrics import structural_similarity
9
+ import torch
10
+ import lpips
11
+ #from DISTS_pytorch import DISTS
12
+ #import colour as c
13
+ #from torchmetrics.image.fid import FrechetInceptionDistance
14
+ import torch.nn.functional as F
15
+ from epe_metric import compute_bidirectional_epe as epe
16
+ import pdb
17
+ import multiprocessing
18
+ import cv2
19
+ import glob
20
+ # init
21
+ # dataDir = 'BaistCroppedOutput' # 'dataGoPro' #
22
+ # gtDir = 'gt_subset' #'GT' #
23
+ # methodDirs = ['deblurred', 'animation-from-blur', ] #['Favaro','MotionETR','Ours','GOPROGeneralize'] #
24
+ # fType = '.mp4'
25
+ # depth = 8
26
+ # resFile = './resultsBaist20250521.npy'#resultsGoPro20250520.npy'#
27
+
28
+ # patchDim = 32 #64 #
29
+ # pixMax = 1.0
30
+
31
+ # nMets = 7 # new results: scoreFVD, scorePWPSNR, scoreEPE, scorePatchSSIM, scorePatchLPIPS, scorePSNR
32
+ # compute = True # if False, load previously computed
33
+ # eps = 1e-8
34
+
35
+ dataDir = 'GOPROResultsImages' # 'dataBaist' #
36
+ gtDir = 'GT' #'gt' #
37
+ methodDirs = ['Jin','MotionETR','Ours'] #'GOPROGeneralize',# ['animation-from-blur'] #
38
+ depth = 8
39
+ resFile = 'resultsGoPro20250521.npy'# './resultsBaist20250521.npy'#
40
+ patchDim = 40 #32 #
41
+ pixMax = 1.0
42
+ nMets = 7 # new results: scoreFVD, scorePWPSNR, scoreEPE, scoreBlurryPSNR, scorePatchSSIM, scorePatchLPIPS, scorePSNR
43
+ compute = False # if False, load previously computed
44
+ eps = 1e-8
45
+
46
+ device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
47
+
48
+ # Use 'spawn' to avoid CUDA context issues
49
+ multiprocessing.freeze_support() # on Windows
50
+ multiprocessing.set_start_method('spawn', force=True)
51
+
52
+ def read_pngs_to_array(path):
53
+ """Read all PNGs in `path`, sort them by filename, convert BGR→RGB, and stack into an np.ndarray."""
54
+ return np.stack([
55
+ cv2.imread(f, cv2.IMREAD_UNCHANGED)[..., ::-1]
56
+ for f in sorted(glob.glob(f"{path}/*.png"))
57
+ ])
58
+ def compute_method(results_local, methodDir, files, countMethod):
59
+
60
+ fnLPIPS = lpips.LPIPS(net='alex').to(device)
61
+ #fnDISTS = DISTS().to(device)
62
+ fnFVD = fvd.cdfvd(model='videomae', device=device)
63
+
64
+ countFile = -1
65
+ for file in files:
66
+ countFile+=1
67
+
68
+ # pull frames from the PNG sequence
69
+ pathMethod = os.path.join(dataDir, methodDir, file)
70
+ framesMethod = np.clip(read_pngs_to_array(pathMethod).astype(np.float32) / (2**depth-1),0,1)
71
+ pathGT = os.path.join(dataDir, gtDir, file)
72
+ framesGT = np.clip(read_pngs_to_array(pathGT).astype(np.float32) / (2**depth-1),0,1)
73
+
74
+ #make sure the GT and method have the same shape
75
+ assert framesGT.shape == framesMethod.shape, f"GT shape {framesGT.shape} does not match method shape {framesMethod.shape} for file {file}"
76
+
77
+ # video metrics
78
+
79
+ # vmaf
80
+ #scoreVMAF = callVMAF(pathGT, pathMethod)
81
+
82
+ # EPE - convert frames to tensors for the bidirectional end-point-error metric
83
+ framesMethodTensor = torch.from_numpy(framesMethod)
84
+ framesGTtensor = torch.from_numpy(framesGT)
85
+ scoreEPE = epe(framesMethodTensor[0,:,:,:], framesMethodTensor[-1,:,:,:], framesGTtensor[0,:,:,:], framesGTtensor[-1,:,:,:], per_pixel_mode=True).cpu().detach().numpy()
86
+
87
+ # motion blur baseline
88
+ blurryGT = np.mean(framesGT ** 2.2,axis=0) ** (1/2.2)
89
+ blurryMethod = np.mean(framesMethod ** 2.2,axis=0) ** (1/2.2)
90
+ # MSE -> PSNR
91
+ mapBlurryMSE = (blurryGT - blurryMethod)**2
92
+ scoreBlurryMSE = np.mean(mapBlurryMSE)
93
+ scoreBlurryPSNR = (10 * np.log10(pixMax**2 / scoreBlurryMSE))
94
+
95
+ # fvd
96
+ #scoreFVD = fnFVD.compute_fvd(real_videos=(np.expand_dims(framesGT, axis=0)*(2**depth-1)).astype(np.uint8), fake_videos=(np.expand_dims(framesMethod, axis=0)*(2**depth-1)).astype(np.uint8))
97
+ framesGTfvd = np.expand_dims((framesGT * (2**depth-1)).astype(np.uint8), axis=0)
98
+ fnFVD.add_real_stats(framesGTfvd)
99
+ framesMethodFVD = np.expand_dims((framesMethod * (2**depth-1)).astype(np.uint8), axis=0)
100
+ fnFVD.add_fake_stats(framesMethodFVD)
101
+
102
+ # loop directions
103
+ framesMSE = np.stack((framesGT,framesGT)) # pre-allocate array for per-direction MSE maps
104
+ countDirect = -1
105
+ for direction in directions:
106
+ countDirect = countDirect+1
107
+ order = direction
108
+
109
+ # loop frames + image level metrics
110
+ countFrames = -1
111
+ for i in order:
112
+ countFrames+=1
113
+
114
+ frameMethod = framesMethod[i,:,:,:] # method frames can be re-ordered
115
+ frameGT = framesGT[countFrames,:,:,:]
116
+
117
+ # assert image size is divisible by patch size
118
+ rows, cols, ch = frameGT.shape
119
+ assert rows % patchDim == 0, f"rows {rows} is not divisible by patchDim {patchDim}"
120
+ assert cols % patchDim == 0, f"cols {cols} is not divisible by patchDim {patchDim}"
121
+
122
+ rPatch = np.ceil(rows/patchDim)
123
+ cPatch = np.ceil(cols/patchDim)
124
+
125
+ # LPIPS
126
+ #pdb.set_trace()
127
+ methodTensor = (torch.from_numpy(np.moveaxis(frameMethod, -1, 0)).unsqueeze(0) * 2 - 1).to(device)
128
+ gtTensor = (torch.from_numpy(np.moveaxis(frameGT, -1, 0)).unsqueeze(0) * 2 - 1).to(device)
129
+ #scoreLPIPS = fnLPIPS(gtTensor, methodTensor).squeeze(0,1,2).cpu().detach().numpy()[0]
130
+
131
+ # FID
132
+ #fnFID.update((gtTensor * (2**depth - 1)).to(torch.uint8), real=True)
133
+ #fnFID.update((methodTensor * (2**depth - 1)).to(torch.uint8), real=False)
134
+
135
+ # DISTS
136
+ #scoreDISTS = fnDISTS(gtTensor.to(torch.float), methodTensor.to(torch.float), require_grad=True, batch_average=True).cpu().detach().numpy()
137
+
138
+ # compute ssim
139
+ #scoreSSIM = structural_similarity(frameGT, frameMethod, data_range=pixMax, channel_axis=2)
140
+
141
+ # compute DE 2000
142
+ #frameMethodXYZ = c.RGB_to_XYZ(frameMethod, c.models.RGB_COLOURSPACE_sRGB, apply_cctf_decoding=True)
143
+ #frameMethodLAB = c.XYZ_to_Lab(frameMethodXYZ)
144
+ #frameGTXYZ = c.RGB_to_XYZ(frameGT, c.models.RGB_COLOURSPACE_sRGB, apply_cctf_decoding=True)
145
+ #frameGTLAB = c.XYZ_to_Lab(frameGTXYZ)
146
+ #mapDE2000 = c.delta_E(frameGTLAB, frameMethodLAB, method='CIE 2000')
147
+ #scoreDE2000 = np.mean(mapDE2000)
148
+
149
+ # MSE
150
+ mapMSE = (frameGT - frameMethod)**2
151
+ scoreMSE = np.mean(mapMSE)
152
+
153
+ # PSNR
154
+ framesMSE[countDirect,countFrames,:,:,:] = mapMSE
155
+ #framesPSNR[countDirect,countFrames,:,:,:] = np.clip((10 * np.log10(pixMax**2 / np.clip(mapMSE,a_min=1e-10,a_max=None))),0,100)
156
+ scorePSNR = (10 * np.log10(pixMax**2 / scoreMSE))
157
+
158
+ #for l in range(ch):
159
+
160
+ # channel-wise metrics
161
+ #chanFrameMethod = frameMethod[:,:,l]
162
+ #chanFrameGT = frameGT[:,:,l]
163
+
164
+ # loop patches rows
165
+ for j in range(int(rPatch)):
166
+
167
+ # loop patches cols + patch level metrics
168
+ for k in range(int(cPatch)):
169
+
170
+ startR = j*patchDim
171
+ startC = k*patchDim
172
+ endR = j*patchDim+patchDim
173
+ endC = k*patchDim+patchDim
174
+
175
+ if endR > rows:
176
+ endR = rows
177
+ else:
178
+ pass
179
+
180
+ if endC > cols:
181
+ endC = cols
182
+ else:
183
+ pass
184
+
185
+ # patch metrics
186
+ #patchMSE = np.mean(mapMSE[startR:endR,startC:endC,:])
187
+ #scorePatchPSNR = np.clip((10 * np.log10(pixMax**2 / patchMSE)),0,100)
188
+ if dataDir == 'BaistCroppedOutput':
189
+ patchGtTensor = F.interpolate(gtTensor[:,:,startR:endR,startC:endC], scale_factor=2.0, mode='bilinear', align_corners=False)
190
+ patchMethodTensor = F.interpolate(methodTensor[:,:,startR:endR,startC:endC], scale_factor=2.0, mode='bilinear', align_corners=False)
191
+ scorePatchLPIPS = fnLPIPS(patchGtTensor, patchMethodTensor).squeeze(0,1,2).cpu().detach().numpy()[0]
192
+ else:
193
+ scorePatchLPIPS = fnLPIPS(gtTensor[:,:,startR:endR,startC:endC], methodTensor[:,:,startR:endR,startC:endC]).squeeze(0,1,2).cpu().detach().numpy()[0]
194
+ scorePatchSSIM = structural_similarity(frameGT[startR:endR,startC:endC,:], frameMethod[startR:endR,startC:endC,:], data_range=pixMax, channel_axis=2)
195
+ #scorePatchDISTS = fnDISTS(gtTensor[:,:,startR:endR,startC:endC].to(torch.float), methodTensor[:,:,startR:endR,startC:endC].to(torch.float), require_grad=True, batch_average=True).cpu().detach().numpy()
196
+ #scorePatchDE2000 = np.mean(mapDE2000[startR:endR,startC:endC])
197
+
198
+ # i: frame number, j: patch row, k: patch col
199
+ #results[countMethod,countFile,countDirect,i,j,k,3:] = [scoreEPE, scoreBlurryPSNR, scoreLPIPS, scoreDISTS, scoreSSIM, scoreDE2000, scorePSNR, scorePatchPSNR, scorePatchSSIM, scorePatchLPIPS, scorePatchDISTS, scorePatchDE2000]
200
+ results_local[countMethod,countFile,countDirect,i,j,k,2:] = [scoreEPE, scoreBlurryPSNR, scorePatchSSIM, scorePatchLPIPS, scorePSNR]
201
+ print('Method: ', methodDir, ' File: ', file, ' Frame: ', str(i), ' PSNR: ', scorePSNR, end='\r')
202
+
203
+ #print('VMAF: ', str(scoreVMAF), ' FVD: ', str(scoreFVD), ' LPIPS: ', str(scoreLPIPS), ' FID: ', str(scoreFID), ' DISTS: ', str(scoreDISTS), ' SSIM: ', str(scoreSSIM), ' DE2000: ', str(scoreDE2000), ' PSNR: ', str(scorePSNR), ' Patch PSNR: ', str(scorePatchPSNR), end='\r')
204
+ #pdb.set_trace()
205
+ scorePWPSNR = (10 * np.log10(pixMax**2 / np.mean(np.min(np.mean(framesMSE, axis=(1)),axis=0)))) # per pixel, keep the better (lower-MSE) frame ordering of the two directions, average over the image, then convert to PSNR
206
+ #print('Method: ', methodDir, ' File: ', file, ' Frame: ', str(i), ' PWPSNR: ', scorePWPSNR, end='\n')
207
+ #scorePWPSNR = np.clip((10 * np.log10(pixMax**2 / np.mean(np.min(framesPSNR, axis=0),axis=(1,2,3)))),0,100) # take max pixel wise PSNR per direction, average over image dims
208
+ results_local[countMethod,countFile,:,:,:,:,1] = np.tile(scorePWPSNR, results_local.shape[2:-1])#np.broadcast_to(scorePWPSNR[:, np.newaxis, np.newaxis], results.shape[3:-1])
209
+ np.save(resFile, results_local) # save part of the way through the loop ..
210
+
211
+ #scoreFID = fnFID.compute().cpu().detach().numpy()
212
+ #fnFID.reset()
213
+ #results[countMethod,:,:,:,:,:,0] = np.tile(scoreFID, results.shape[1:-1])
214
+ scoreFVD = fnFVD.compute_fvd_from_stats()
215
+ fnFVD.empty_real_stats()
216
+ fnFVD.empty_fake_stats()
217
+ results_local[countMethod,:,:,:,:,:,0] = np.tile(scoreFVD, results_local.shape[1:-1])
218
+ print('Results computed .. analyzing ..')
219
+
220
+ return results_local
221
+
222
+
223
+ # init results matrix
224
+ path = os.path.join(dataDir, gtDir)
225
+ clipDirs = [name for name in os.listdir(path) if os.path.isdir(os.path.join(path, name))]
226
+ files = []
227
+ if dataDir == 'BaistCroppedOutput':
228
+ extraFknDir = 'blur'
229
+ else:
230
+ extraFknDir = ''
231
+ for clipDir in clipDirs:
232
+ path = os.path.join(dataDir, gtDir, clipDir, extraFknDir)
233
+ files = files + [os.path.join(clipDir,extraFknDir,name) for name in os.listdir(path)]
234
+ files = sorted(files)
235
+ path = os.path.join(dataDir, methodDirs[0], files[0])
236
+ testFileGT = read_pngs_to_array(path)
237
+ frams,rows,cols,ch = testFileGT.shape
238
+ framRange = [i for i in range(frams)]
239
+ directions = [framRange, framRange[::-1]]
240
+
241
+ #loop through all methods and make sure they all have the same directory structure and same number of files
242
+ for methodDir in methodDirs:
243
+ path = os.path.join(dataDir, methodDir)
244
+ clipDirs = [name for name in os.listdir(path) if os.path.isdir(os.path.join(path, name))]
245
+ filesMethod = []
246
+ for clipDir in clipDirs:
247
+ path = os.path.join(dataDir, methodDir, clipDir, extraFknDir)
248
+ filesMethod = filesMethod + [os.path.join(clipDir,extraFknDir,name) for name in os.listdir(path)]
249
+ filesMethod = sorted(filesMethod)
250
+ print('Method: ', methodDir, ' Number of files: ', len(filesMethod))
251
+ assert len(files) == len(filesMethod), f"Number of files in {methodDir} does not match GT number of files"
252
+ assert files == filesMethod, f"Files in {methodDir} do not match GT files"
253
+
254
+
255
+ def main():
256
+
257
+ results = np.zeros((len(methodDirs),len(files),len(directions),frams,int(np.ceil(rows/patchDim)),int(np.ceil(cols/patchDim)),nMets))
258
+
259
+ if compute:
260
+
261
+ # loop methods + compute dataset level metrics (after nested for loops)
262
+ import multiprocessing as mp
263
+ ctx = mp.get_context('spawn')
264
+ with ProcessPoolExecutor(mp_context=ctx, max_workers=len(methodDirs)) as executor:
265
+ # submit one job per method
266
+ futures = {
267
+ executor.submit(compute_method, np.copy(results), md, files, idx): idx
268
+ for idx, md in enumerate(methodDirs)
269
+ }
270
+ # collect and merge results as they finish
271
+ for fut in as_completed(futures):
272
+ idx = futures[fut]
273
+ res_local = fut.result()
274
+ results[idx] = res_local[idx]
275
+
276
+
277
+ else:
278
+
279
+ results = np.load(resFile)
280
+
281
+ np.save(resFile, results)
282
+ # analyze
283
+
284
+ # new results: scoreFID, scoreFVD, scorePWPSNR, scoreEPE, scoreLPIPS, scoreDISTS, scoreSSIM, scoreDE2000, scorePSNR, scorePatchPSNR, scorePatchSSIM, scorePatchLPIPS, scorePatchDISTS, scorePatchDE2000
285
+ upMetrics = [1,3,4,6]
286
+
287
+
288
+ # 0508 results: scoreFID, scoreFVD, scoreLPIPS, scoreDISTS, scoreSSIM, scoreDE2000, scorePSNR, scorePatchPSNR, scorePatchSSIM, scorePatchLPIPS, scorePatchDISTS, scorePatchDE2000
289
+ #upMetrics = [4,6,7,8] # PSNR, SSIM, Patch PSNR, Patch SSIM
290
+ print("Results shape 1: ", results.shape)
291
+ forwardBackwardResults = np.mean(results,axis=(3))
292
+ #print("Results shape 2: ", forwardResults.shape)
293
+ maxDirResults = np.max(forwardBackwardResults,axis=(2))
294
+ minDirResults = np.min(forwardBackwardResults,axis=(2))
295
+ bestDirResults = minDirResults
296
+ #pdb.set_trace()
297
+ bestDirResults[:,:,:,:,upMetrics] = maxDirResults[:,:,:,:,upMetrics]
298
+ import pdb
299
+ #pdb.set_trace()
300
+
301
+ meanResults = bestDirResults.mean(axis=(1, 2, 3)) # Shape becomes (num_methods, nMets)
302
+ meanResultsT = meanResults.T
303
+
304
+ '''
305
+ maxDirResults = np.max(results,axis=2)
306
+ minDirResults = np.min(results,axis=2)
307
+ bestDirResults = minDirResults
308
+ bestDirResults[:,:,:,:,:,upMetrics] = maxDirResults[:,:,:,:,:,upMetrics]
309
+ meanResults = bestDirResults.mean(axis=(1, 2, 3, 4)) # Shape becomes (3, 6)
310
+ meanResultsT = meanResults.T
311
+ '''
312
+
313
+ #
314
+ #meanResults = forwardResults.mean(axis=(1, 2, 3, 4)) # Shape becomes (3, 6)
315
+ #meanResultsT = meanResults.T
316
+
317
+ # print latex table
318
+ method_labels = methodDirs
319
+
320
+ # results 0508: scoreLPIPS, scoreDISTS, scoreSSIM, scoreDE2000, scorePSNR, scorePatchPSNR, scorePatchSSIM, scorePatchLPIPS, scorePatchDISTS, scoreFID, scoreFVD
321
+ # metric_labels = ["FID $\downarrow$","FVD $\downarrow$","LPIPS $\downarrow$", "DISTS $\downarrow$", "SSIM $\downarrow$", "DE2000 $\downarrow$", "PSNR $\downarrow$", "Patch PSNR $\downarrow$", "Patch SSIM $\downarrow$", "Patch LPIPS $\downarrow$", "Patch DISTS $\downarrow$", "Patch DE2000 $\downarrow$"]
322
+ # results 0517:
323
+ # metric_labels = ["FID $\downarrow$","FVD $\downarrow$","PWPSNR $\downarrow$","EPE $\downarrow$","BlurryPSNR $\downarrow$", "LPIPS $\downarrow$", "DISTS $\downarrow$", "SSIM $\downarrow$", "DE2000 $\downarrow$", "PSNR $\downarrow$", "Patch PSNR $\downarrow$", "Patch SSIM $\downarrow$", "Patch LPIPS $\downarrow$", "Patch DISTS $\downarrow$", "Patch DE2000 $\downarrow$"]
324
+
325
+ # results 0518:
326
+ metric_labels = ["FVD $\downarrow$","PWPSNR $\downarrow$","EPE $\downarrow$","BlurryPSNR $\downarrow$","Patch SSIM $\downarrow$","Patch LPIPS $\downarrow$", "PSNR $\downarrow$"]
327
+
328
+ # appropriate for results 0507
329
+ #metric_labels = ["FID $\downarrow$", "FVD $\downarrow$", "LPIPS $\downarrow$", "DISTS $\downarrow$", "SSIM $\downarrow$", "DE2000 $\downarrow$", "PSNR $\downarrow$"]
330
+
331
+ latex_table = "\\begin{tabular}{l" + "c" * len(method_labels) + "}\n"
332
+ latex_table += "Metric & " + " & ".join(method_labels) + " \\\\\n"
333
+ latex_table += "\\hline\n"
334
+
335
+ for metric, row in zip(metric_labels, meanResultsT):
336
+ row_values = " & ".join(f"{v:.4f}" for v in row)
337
+ latex_table += f"{metric} & {row_values} \\\\\n"
338
+
339
+ latex_table += "\\end{tabular}"
340
+ print(latex_table)
341
+
342
+ if __name__ == '__main__':
343
+ main()
gradio/app.py ADDED
@@ -0,0 +1,85 @@
1
+ import os
2
+ import uuid
3
+ from pathlib import Path
4
+
5
+ import gradio as gr
6
+ from PIL import Image
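+ # import imageio.v3 as iio  # only needed if you write frames with the iio.imwrite call sketched below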
7
+
8
+ # -----------------------
9
+ # 1. Load your model here
10
+ # -----------------------
11
+ # Example:
12
+ # from my_model_lib import MyVideoModel
13
+ # model = MyVideoModel.from_pretrained("your/model/hub/id")
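+ # See inference.py (load_model / inference_on_image) in this repo for a full example of loading the ControlnetCogVideoXPipeline.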
14
+
15
+ OUTPUT_DIR = Path("/tmp/generated_videos")
16
+ OUTPUT_DIR.mkdir(parents=True, exist_ok=True)
17
+
18
+
19
+ def generate_video_from_image(image: Image.Image) -> str:
20
+ video_id = uuid.uuid4().hex
21
+ output_path = OUTPUT_DIR / f"{video_id}.mp4"
22
+
23
+ # 1. Preprocess image
24
+ # img_tensor = preprocess(image) # your code
25
+
26
+ # 2. Run model
27
+ # frames = model(img_tensor) # e.g. np.ndarray of shape (T, H, W, 3), dtype=uint8
28
+
29
+ # 3. Save frames to video
30
+ # iio.imwrite(
31
+ # uri=output_path,
32
+ # image=frames,
33
+ # fps=16,
34
+ # codec="h264",
35
+ # )
36
+
37
+ return str(output_path)
38
+
39
+
40
+ def demo_predict(image: Image.Image) -> str:
41
+ """
42
+ Wrapper for Gradio. Takes an image and returns a video path.
43
+ """
44
+ if image is None:
45
+ raise gr.Error("Please upload an image first.")
46
+
47
+ video_path = generate_video_from_image(image)
48
+ if not os.path.exists(video_path):
49
+ raise gr.Error("Video generation failed: output file not found.")
50
+ return video_path
51
+
52
+
53
+ with gr.Blocks(css="footer {visibility: hidden}") as demo:
54
+ gr.Markdown(
55
+ """
56
+ # 🖼️ ➜ 🎬 Recover motion from a blurry image!
57
+
58
+ Upload an image and the model will generate a short video.
59
+ """
60
+ )
61
+
62
+ with gr.Row():
63
+ with gr.Column():
64
+ image_in = gr.Image(
65
+ type="pil",
66
+ label="Input image",
67
+ interactive=True,
68
+ )
69
+ generate_btn = gr.Button("Generate video", variant="primary")
70
+ with gr.Column():
71
+ video_out = gr.Video(
72
+ label="Generated video",
73
+ format="mp4", # ensures browser-friendly output
74
+ autoplay=True,
75
+ )
76
+
77
+ generate_btn.click(
78
+ fn=demo_predict,
79
+ inputs=image_in,
80
+ outputs=video_out,
81
+ api_name="predict",
82
+ )
83
+
84
+ if __name__ == "__main__":
85
+ demo.launch()
inference.py ADDED
@@ -0,0 +1,317 @@
1
+ # Copyright 2024 The HuggingFace Team.
2
+ # All rights reserved.
3
+ #
4
+ # Licensed under the Apache License, Version 2.0 (the "License");
5
+ # you may not use this file except in compliance with the License.
6
+ # You may obtain a copy of the License at
7
+ #
8
+ # http://www.apache.org/licenses/LICENSE-2.0
9
+ #
10
+ # Unless required by applicable law or agreed to in writing, software
11
+ # distributed under the License is distributed on an "AS IS" BASIS,
12
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13
+ # See the License for the specific language governing permissions and
14
+ # limitations under the License.
15
+
16
+ import argparse
17
+ import os
18
+ from pathlib import Path
19
+ import io
20
+ import yaml
21
+
22
+ from PIL import Image, ImageCms
23
+ import torch
24
+ import numpy as np
25
+ from transformers import T5Tokenizer, T5EncoderModel
26
+ from safetensors.torch import load_file
27
+ import diffusers
28
+ from diffusers import AutoencoderKLCogVideoX, CogVideoXDPMScheduler
29
+ from diffusers.utils import check_min_version, export_to_video
30
+
31
+ from controlnet_pipeline import ControlnetCogVideoXPipeline
32
+ from cogvideo_transformer import CogVideoXTransformer3DModel
33
+
34
+ from training.utils import save_frames_as_pngs
35
+ from training.helpers import get_conditioning
36
+
37
+ # Will raise an error if the minimal version of diffusers is not installed. Remove at your own risk.
38
+ check_min_version("0.31.0.dev0")
39
+
40
+
41
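+ # Convert a PIL image to sRGB, honoring an embedded ICC profile when present; otherwise assume the image is already sRGB.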
+ def convert_to_srgb(img: Image):
42
+ if 'icc_profile' in img.info:
43
+ icc = img.info['icc_profile']
44
+ src_profile = ImageCms.ImageCmsProfile(io.BytesIO(icc))
45
+ dst_profile = ImageCms.createProfile("sRGB")
46
+ img = ImageCms.profileToProfile(img, src_profile, dst_profile, outputMode='RGB')
47
+ else:
48
+ img = img.convert("RGB") # Assume sRGB
49
+ return img
50
+
51
+
52
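+ # Conditioning presets. Assumed semantics (from the field names): in_start/in_end mark the part of the window covered by the blurry input, out_start/out_end the frames to reconstruct, over window_size frames at the given fps.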
+ INTERVALS = {
53
+ "present": {
54
+ "in_start": 0,
55
+ "in_end": 16,
56
+ "out_start": 0,
57
+ "out_end": 16,
58
+ "center": 8,
59
+ "window_size": 16,
60
+ "mode": "1x",
61
+ "fps": 240
62
+ },
63
+ "past_present_and_future": {
64
+ "in_start": 4,
65
+ "in_end": 12,
66
+ "out_start": 0,
67
+ "out_end": 16,
68
+ "center": 8,
69
+ "window_size": 16,
70
+ "mode": "2x",
71
+ "fps": 240,
72
+ },
73
+ }
74
+
75
+
76
+ def convert_to_batch(
77
+ image,
78
+ interval_key="present",
79
+ image_size=(720, 1280),
80
+ ):
81
+ interval = INTERVALS[interval_key]
82
+
83
+ inp_int, out_int, num_frames = get_conditioning(
84
+ in_start=interval['in_start'],
85
+ in_end=interval['in_end'],
86
+ out_start=interval['out_start'],
87
+ out_end=interval['out_end'],
88
+ mode=interval['mode'],
89
+ fps=interval['fps'],
90
+ )
91
+
92
+ blur_img_original = convert_to_srgb(image)
93
+ W, H = blur_img_original.size # PIL .size is (width, height)
94
+
95
+ blur_img = blur_img_original.resize((image_size[1], image_size[0])) # pil is width, height
96
+ blur_img = torch.from_numpy(np.array(blur_img)[None]).permute(0, 3, 1, 2).contiguous().float()
97
+ blur_img = blur_img / 127.5 - 1.0
98
+
99
+ data = {
100
+ "original_size": (H, W),
101
+ 'blur_img': blur_img,
102
+ 'caption': "",
103
+ 'input_interval': inp_int,
104
+ 'output_interval': out_int,
105
+ 'height': image_size[0],
106
+ 'width': image_size[1],
107
+ 'num_frames': num_frames,
108
+ }
109
+ return data
110
+
111
+
112
+ def load_model(args):
113
+ with open(args.model_config_path) as f:
114
+ model_config = yaml.safe_load(f)
115
+
116
+ load_dtype = torch.float16
117
+ transformer = CogVideoXTransformer3DModel.from_pretrained(
118
+ args.pretrained_model_path,
119
+ subfolder="transformer",
120
+ torch_dtype=load_dtype,
121
+ revision=model_config["revision"],
122
+ variant=model_config["variant"],
123
+ low_cpu_mem_usage=False,
124
+ )
125
+ transformer.load_state_dict(load_file(args.weight_path))
126
+
127
+ text_encoder = T5EncoderModel.from_pretrained(
128
+ args.pretrained_model_path,
129
+ subfolder="text_encoder",
130
+ revision=model_config["revision"],
131
+ )
132
+
133
+ tokenizer = T5Tokenizer.from_pretrained(
134
+ args.pretrained_model_path,
135
+ subfolder="tokenizer",
136
+ revision=model_config["revision"],
137
+ )
138
+
139
+ vae = AutoencoderKLCogVideoX.from_pretrained(
140
+ args.pretrained_model_path,
141
+ subfolder="vae",
142
+ revision=model_config["revision"],
143
+ variant=model_config["variant"],
144
+ )
145
+
146
+ scheduler = CogVideoXDPMScheduler.from_pretrained(
147
+ args.pretrained_model_path,
148
+ subfolder="scheduler"
149
+ )
150
+
151
+ # Enable VAE slicing and tiling to reduce VRAM usage.
152
+ vae.enable_slicing()
153
+ vae.enable_tiling()
154
+
155
+ # For mixed-precision inference we cast all weights (vae, text_encoder and transformer) to half precision
156
+ # as these weights are only used for inference, keeping weights in full precision is not required.
157
+ weight_dtype = torch.bfloat16
158
+
159
+ text_encoder.to(args.device, dtype=weight_dtype)
160
+ transformer.to(args.device, dtype=weight_dtype)
161
+ vae.to(args.device, dtype=weight_dtype)
162
+
163
+ pipe = ControlnetCogVideoXPipeline.from_pretrained(
164
+ args.pretrained_model_path,
165
+ tokenizer=tokenizer,
166
+ transformer=transformer,
167
+ text_encoder=text_encoder,
168
+ vae=vae,
169
+ scheduler=scheduler,
170
+ torch_dtype=weight_dtype,
171
+ )
172
+
173
+ scheduler_args = {}
174
+
175
+ if "variance_type" in pipe.scheduler.config:
176
+ variance_type = pipe.scheduler.config.variance_type
177
+
178
+ if variance_type in ["learned", "learned_range"]:
179
+ variance_type = "fixed_small"
180
+
181
+ scheduler_args["variance_type"] = variance_type
182
+
183
+ pipe.scheduler = CogVideoXDPMScheduler.from_config(pipe.scheduler.config, **scheduler_args)
184
+ pipe = pipe.to(args.device)
185
+
186
+ return pipe, model_config
187
+
188
+
189
+ def inference_on_image(pipe, image, interval_key, model_config, args):
190
+ # If passed along, set the random seed now.
191
+ if args.seed is not None:
192
+ np.random.seed(args.seed)
193
+ torch.manual_seed(args.seed)
194
+
195
+ # run inference
196
+ generator = torch.Generator(device=args.device).manual_seed(args.seed) if args.seed else None
197
+
198
+ with torch.autocast(args.device, enabled=True):
199
+ batch = convert_to_batch(image, interval_key, (args.video_height, args.video_width))
200
+
201
+ frame = batch["blur_img"].permute(0, 2, 3, 1).cpu().numpy()
202
+ frame = (frame + 1.0) * 127.5
203
+ frame = frame.astype(np.uint8)
204
+ pipeline_args = {
205
+ "prompt": "",
206
+ "negative_prompt": "",
207
+ "image": frame,
208
+ "input_intervals": torch.stack([batch["input_interval"]]),
209
+ "output_intervals": torch.stack([batch["output_interval"]]),
210
+ "guidance_scale": model_config["guidance_scale"],
211
+ "use_dynamic_cfg": model_config["use_dynamic_cfg"],
212
+ "height": batch["height"],
213
+ "width": batch["width"],
214
+ "num_frames": torch.tensor([[model_config["max_num_frames"]]]), # torch.tensor([[batch["num_frames"]]]),
215
+ "num_inference_steps": model_config["num_inference_steps"],
216
+ }
217
+
218
+ input_image = frame
219
+
220
+ num_frames = batch["num_frames"] # this is the actual number of frames, the video generation is padded by one frame
221
+
222
+ print(f"Running inference for interval {interval_key}...")
223
+ video = pipe(**pipeline_args, generator=generator, output_type="np").frames[0]
224
+
225
+ video = video[0:num_frames]
226
+
227
+ return input_image, video
228
+
229
+
230
+ def main(args):
231
+ output_path = Path(args.output_path)
232
+ output_path.mkdir(exist_ok=True)
233
+
234
+ image_path = Path(args.image_path)
235
+
236
+ is_dir = image_path.is_dir()
237
+
238
+ if is_dir:
239
+ image_paths = sorted(list(image_path.glob("*.*")))
240
+ else:
241
+ image_paths = [image_path]
242
+
243
+ pipe, model_config = load_model(args)
244
+
245
+ for image_path in image_paths:
246
+ image = Image.open(image_path)
247
+
248
+ processed_image, video = inference_on_image(pipe, image, "past_present_and_future", model_config, args)
249
+
250
+ vid_output_path = output_path / f"{image_path.stem}.mp4"
251
+ export_to_video(video, vid_output_path, fps=20)
252
+
253
+ # save input image as well
254
+ input_image_output_path = output_path / f"{image_path.stem}_input.png"
255
+ Image.fromarray(processed_image[0]).save(input_image_output_path)
256
+
257
+
258
+ if __name__ == "__main__":
259
+ parser = argparse.ArgumentParser()
260
+ parser.add_argument(
261
+ "--image_path",
262
+ type=str,
263
+ required=True,
264
+ help="Path to image input or directory containing input images",
265
+ )
266
+ parser.add_argument(
267
+ "--weight_path",
268
+ type=str,
269
+ default="training/cogvideox-outsidephotos/checkpoint/model.safetensors",
270
+ help="directory containing weight files",
271
+ )
272
+ parser.add_argument(
273
+ "--pretrained_model_path",
274
+ type=str,
275
+ default="THUDM/CogVideoX-2b",
276
+ help="repo id or path for pretrained CogVideoX model",
277
+ )
278
+ parser.add_argument(
279
+ "--model_config_path",
280
+ type=str,
281
+ default="training/configs/outsidephotos.yaml",
282
+ help="path to model config yaml",
283
+ )
284
+ parser.add_argument(
285
+ "--output_path",
286
+ type=str,
287
+ required=True,
288
+ help="path to output",
289
+ )
290
+ parser.add_argument(
291
+ "--video_width",
292
+ type=int,
293
+ default=1280,
294
+ help="video resolution width",
295
+ )
296
+ parser.add_argument(
297
+ "--video_height",
298
+ type=int,
299
+ default=720,
300
+ help="video resolution height",
301
+ )
302
+ parser.add_argument(
303
+ "--seed",
304
+ type=int,
305
+ default=None,
306
+ help="random generator seed",
307
+ )
308
+ parser.add_argument(
309
+ "--device",
310
+ type=str,
311
+ default="cuda",
312
+ help="inference device",
313
+ )
314
+ args = parser.parse_args()
315
+ main(args)
316
+
317
+ # python inference.py --image_path assets/dummy_image.png --output_path output/
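+ # Remaining flags (--weight_path, --pretrained_model_path, --model_config_path, --video_width, --video_height, --seed, --device) fall back to the defaults defined above.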
requirements.txt ADDED
@@ -0,0 +1,22 @@
1
+ spaces>=0.29.3
2
+ safetensors>=0.4.5
3
+ spandrel>=0.4.0
4
+ tqdm>=4.66.5
5
+ scikit-video>=1.1.11
6
+ git+https://github.com/huggingface/diffusers.git@main
7
+ transformers>=4.44.0
8
+ accelerate>=0.34.2
9
+ opencv-python>=4.10.0.84
10
+ sentencepiece>=0.2.0
11
+ numpy==1.26.0
12
+ torch>=2.4.0
13
+ torchvision>=0.19.0
14
+ gradio>=4.44.0
15
+ imageio>=2.34.2
16
+ imageio-ffmpeg>=0.5.1
17
+ openai>=1.45.0
18
+ moviepy>=1.0.3
19
+ pillow==9.5.0
20
+ denku==0.0.51
21
+ controlnet-aux==0.0.9
setup/download_checkpoints.py ADDED
@@ -0,0 +1,53 @@
1
+ from huggingface_hub import snapshot_download
2
+ import os
3
+ import sys
4
+ # Make sure HF_TOKEN is set in your env beforehand:
5
+ # export HF_TOKEN=your_hf_token
6
+ # Select which checkpoint set to download from the first command-line argument
7
+
8
+
9
+ mode = sys.argv[1] if len(sys.argv) > 1 else "outsidephotos"
10
+
11
+
12
+ REPO_ID = "tedlasai/blur2vid"
13
+ REPO_TYPE = "model"
14
+
15
+ # These are the subfolders you previously used as path_in_repo
16
+ if mode == "outsidephotos":
17
+ checkpoints = [
18
+ "cogvideox-outsidephotos",
19
+ ]
20
+ elif mode == "gopro":
21
+ checkpoints = [
22
+ "cogvideox-gopro-test",
23
+ "cogvideox-gopro-2x-test",
24
+ ]
25
+ elif mode == "baist":
26
+ checkpoints = [
27
+ "cogvideox-baist-test",
28
+ ]
29
+ elif mode == "full":
30
+ checkpoints = [
31
+ "cogvideox-baist-test",
32
+ "cogvideox-gopro-test",
33
+ "cogvideox-gopro-2x-test",
34
+ "cogvideox-full-test",
35
+ "cogvideox-outsidephotos",
36
+ ]
37
+
38
+ # This is the root local directory where you want everything saved
39
+ # (resolved relative to this file's location)
40
+ LOCAL_TRAINING_ROOT = os.path.join(os.path.dirname(os.path.abspath(__file__)), "..", "training")
41
+ os.makedirs(LOCAL_TRAINING_ROOT, exist_ok=True)
42
+
43
+ # Download only those folders from the repo and place them under LOCAL_TRAINING_ROOT
44
+ snapshot_download(
45
+ repo_id=REPO_ID,
46
+ repo_type=REPO_TYPE,
47
+ local_dir=LOCAL_TRAINING_ROOT,
48
+ local_dir_use_symlinks=False,
49
+ allow_patterns=[f"{name}/*" for name in checkpoints],
50
+ token=os.getenv("HF_TOKEN"),
51
+ )
52
+
53
+ print(f"Done! Checkpoints downloaded under: {LOCAL_TRAINING_ROOT}")
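+ # Example: python setup/download_checkpoints.py gopro
+ # Valid modes: outsidephotos (default), gopro, baist, full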
setup/download_cogvideo_weights.py ADDED
@@ -0,0 +1,6 @@
1
+ from huggingface_hub import snapshot_download
2
+
3
+ # Download the entire model repository and store it locally
4
+ model_path = snapshot_download(repo_id="THUDM/CogVideoX-2b", cache_dir="./CogVideoX-2b")
5
+
6
+ print(f"Model downloaded to: {model_path}")
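+ # With cache_dir set, snapshot_download stores files in the HF cache layout, i.e.
+ # ./CogVideoX-2b/models--THUDM--CogVideoX-2b/snapshots/<commit>/, which matches the
+ # pretrained_model_name_or_path used in the training configs.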
setup/environment.yaml ADDED
@@ -0,0 +1,225 @@
1
+ name: blur2vid
2
+ channels:
3
+ - conda-forge
4
+ - defaults
5
+ dependencies:
6
+ - _libgcc_mutex=0.1=main
7
+ - _openmp_mutex=5.1=1_gnu
8
+ - asttokens=3.0.0=pyhd8ed1ab_1
9
+ - bzip2=1.0.8=h5eee18b_6
10
+ - ca-certificates=2025.4.26=hbd8a1cb_0
11
+ - comm=0.2.2=pyhd8ed1ab_1
12
+ - debugpy=1.6.0=py310hd8f1fbe_0
13
+ - entrypoints=0.4=pyhd8ed1ab_1
14
+ - exceptiongroup=1.2.2=pyhd8ed1ab_1
15
+ - executing=2.2.0=pyhd8ed1ab_0
16
+ - ffmpeg=4.3.2=hca11adc_0
17
+ - freetype=2.10.4=h0708190_1
18
+ - gmp=6.2.1=h58526e2_0
19
+ - gnutls=3.6.13=h85f3911_1
20
+ - ipykernel=6.20.2=pyh210e3f2_0
21
+ - ipython=8.36.0=pyh907856f_0
22
+ - jedi=0.19.2=pyhd8ed1ab_1
23
+ - jupyter_client=7.3.4=pyhd8ed1ab_0
24
+ - jupyter_core=5.7.2=pyh31011fe_1
25
+ - lame=3.100=h7f98852_1001
26
+ - ld_impl_linux-64=2.40=h12ee557_0
27
+ - libevent=2.1.12=hdbd6064_1
28
+ - libffi=3.4.4=h6a678d5_1
29
+ - libgcc-ng=11.2.0=h1234567_1
30
+ - libgomp=11.2.0=h1234567_1
31
+ - libpng=1.6.37=h21135ba_2
32
+ - libsodium=1.0.18=h36c2ea0_1
33
+ - libstdcxx-ng=11.2.0=h1234567_1
34
+ - libuuid=1.41.5=h5eee18b_0
35
+ - matplotlib-inline=0.1.7=pyhd8ed1ab_1
36
+ - ncurses=6.4=h6a678d5_0
37
+ - nest-asyncio=1.6.0=pyhd8ed1ab_1
38
+ - nettle=3.6=he412f7d_0
39
+ - openh264=2.1.1=h780b84a_0
40
+ - openssl=3.0.16=h5eee18b_0
41
+ - parso=0.8.4=pyhd8ed1ab_1
42
+ - pexpect=4.9.0=pyhd8ed1ab_1
43
+ - pickleshare=0.7.5=pyhd8ed1ab_1004
44
+ - pip=25.0=py310h06a4308_0
45
+ - platformdirs=4.3.7=pyh29332c3_0
46
+ - prompt-toolkit=3.0.51=pyha770c72_0
47
+ - ptyprocess=0.7.0=pyhd8ed1ab_1
48
+ - pure_eval=0.2.3=pyhd8ed1ab_1
49
+ - pygments=2.19.1=pyhd8ed1ab_0
50
+ - python=3.10.16=he870216_1
51
+ - python-dateutil=2.9.0.post0=pyhff2d567_1
52
+ - python_abi=3.10=2_cp310
53
+ - pyzmq=23.0.0=py310h330234f_0
54
+ - readline=8.2=h5eee18b_0
55
+ - setuptools=75.8.0=py310h06a4308_0
56
+ - six=1.17.0=pyhd8ed1ab_0
57
+ - sqlite=3.45.3=h5eee18b_0
58
+ - stack_data=0.6.3=pyhd8ed1ab_1
59
+ - tk=8.6.14=h39e8969_0
60
+ - tmux=3.3a=h5eee18b_1
61
+ - tornado=6.1=py310h5764c6d_3
62
+ - traitlets=5.14.3=pyhd8ed1ab_1
63
+ - typing_extensions=4.13.2=pyh29332c3_0
64
+ - wcwidth=0.2.13=pyhd8ed1ab_1
65
+ - wheel=0.45.1=py310h06a4308_0
66
+ - x264=1!161.3030=h7f98852_1
67
+ - xz=5.6.4=h5eee18b_1
68
+ - zeromq=4.3.4=h9c3ff4c_1
69
+ - zlib=1.2.13=h5eee18b_1
70
+ - pip:
71
+ - absl-py==2.2.0
72
+ - accelerate==1.5.2
73
+ - aiofiles==23.2.1
74
+ - aiohappyeyeballs==2.6.1
75
+ - aiohttp==3.12.14
76
+ - aiosignal==1.4.0
77
+ - annotated-types==0.7.0
78
+ - anyio==4.9.0
79
+ - async-timeout==5.0.1
80
+ - atomicwrites==1.4.1
81
+ - attrs==25.3.0
82
+ - beautifulsoup4==4.13.4
83
+ - certifi==2025.1.31
84
+ - cffi==1.17.1
85
+ - charset-normalizer==3.4.1
86
+ - click==8.1.8
87
+ - colour-science==0.4.6
88
+ - contourpy==1.3.1
89
+ - controlnet-aux==0.0.9
90
+ - cycler==0.12.1
91
+ - decorator==4.4.2
92
+ - decord==0.6.0
93
+ - denku==0.0.51
94
+ - diffusers==0.32.0
95
+ - distro==1.9.0
96
+ - docker-pycreds==0.4.0
97
+ - einops==0.8.1
98
+ - einops-exts==0.0.4
99
+ - fastapi==0.115.11
100
+ - ffmpeg-python==0.2.0
101
+ - ffmpy==0.5.0
102
+ - filelock==3.18.0
103
+ - flatbuffers==25.2.10
104
+ - fonttools==4.56.0
105
+ - frozenlist==1.7.0
106
+ - fsspec==2025.3.0
107
+ - future==1.0.0
108
+ - gdown==5.2.0
109
+ - gitdb==4.0.12
110
+ - gitpython==3.1.44
111
+ - gradio==5.22.0
112
+ - gradio-client==1.8.0
113
+ - groovy==0.1.2
114
+ - h11==0.14.0
115
+ - hf-transfer==0.1.9
116
+ - httpcore==1.0.7
117
+ - httpx==0.28.1
118
+ - huggingface-hub==0.29.3
119
+ - idna==3.10
120
+ - imageio==2.37.0
121
+ - imageio-ffmpeg==0.6.0
122
+ - importlib-metadata==8.6.1
123
+ - jax==0.5.3
124
+ - jaxlib==0.5.3
125
+ - jinja2==3.1.6
126
+ - jiter==0.9.0
127
+ - kiwisolver==1.4.8
128
+ - lazy-loader==0.4
129
+ - lightning==2.5.2
130
+ - lightning-utilities==0.14.3
131
+ - markdown-it-py==3.0.0
132
+ - markupsafe==3.0.2
133
+ - matplotlib==3.10.1
134
+ - mdurl==0.1.2
135
+ - mediapipe==0.10.21
136
+ - ml-dtypes==0.5.1
137
+ - moviepy==1.0.3
138
+ - mpmath==1.3.0
139
+ - multidict==6.6.3
140
+ - networkx==3.4.2
141
+ - numpy==1.26.0
142
+ - nvidia-cublas-cu12==12.4.5.8
143
+ - nvidia-cuda-cupti-cu12==12.4.127
144
+ - nvidia-cuda-nvrtc-cu12==12.4.127
145
+ - nvidia-cuda-runtime-cu12==12.4.127
146
+ - nvidia-cudnn-cu12==9.1.0.70
147
+ - nvidia-cufft-cu12==11.2.1.3
148
+ - nvidia-curand-cu12==10.3.5.147
149
+ - nvidia-cusolver-cu12==11.6.1.9
150
+ - nvidia-cusparse-cu12==12.3.1.170
151
+ - nvidia-cusparselt-cu12==0.6.2
152
+ - nvidia-ml-py==12.570.86
153
+ - nvidia-nccl-cu12==2.21.5
154
+ - nvidia-nvjitlink-cu12==12.4.127
155
+ - nvidia-nvtx-cu12==12.4.127
156
+ - nvitop==1.4.2
157
+ - openai==1.68.2
158
+ - opencv-contrib-python==4.11.0.86
159
+ - opencv-python==4.11.0.86
160
+ - opencv-python-headless==4.11.0.86
161
+ - opt-einsum==3.4.0
162
+ - orjson==3.10.15
163
+ - packaging==24.2
164
+ - pandas==2.2.3
165
+ - peft==0.15.0
166
+ - pillow==9.5.0
167
+ - proglog==0.1.10
168
+ - propcache==0.3.2
169
+ - protobuf==4.25.6
170
+ - psutil==5.9.8
171
+ - ptflops==0.7.4
172
+ - pycparser==2.22
173
+ - pydantic==2.10.6
174
+ - pydantic-core==2.27.2
175
+ - pydub==0.25.1
176
+ - pyparsing==3.2.1
177
+ - pysocks==1.7.1
178
+ - python-dotenv==1.0.1
179
+ - python-multipart==0.0.20
180
+ - pytorch-lightning==2.5.2
181
+ - pytz==2025.1
182
+ - pyyaml==6.0.2
183
+ - regex==2024.11.6
184
+ - requests==2.32.3
185
+ - rich==13.9.4
186
+ - ruff==0.11.2
187
+ - safehttpx==0.1.6
188
+ - safetensors==0.5.3
189
+ - scikit-image==0.24.0
190
+ - scikit-video==1.1.11
191
+ - scipy==1.15.2
192
+ - semantic-version==2.10.0
193
+ - sentencepiece==0.2.0
194
+ - sentry-sdk==2.24.0
195
+ - setproctitle==1.3.5
196
+ - shellingham==1.5.4
197
+ - smmap==5.0.2
198
+ - sniffio==1.3.1
199
+ - sounddevice==0.5.1
200
+ - soupsieve==2.7
201
+ - spaces==0.32.0
202
+ - spandrel==0.4.1
203
+ - starlette==0.46.1
204
+ - sympy==1.13.1
205
+ - tifffile==2025.3.13
206
+ - timm==0.6.7
207
+ - tokenizers==0.21.1
208
+ - tomlkit==0.13.2
209
+ - torch==2.6.0
210
+ - torch-fidelity==0.3.0
211
+ - torchmetrics==1.7.4
212
+ - torchvision==0.21.0
213
+ - tqdm==4.67.1
214
+ - transformers==4.50.0
215
+ - triton==3.2.0
216
+ - typer==0.15.2
217
+ - typing-extensions==4.12.2
218
+ - tzdata==2025.1
219
+ - urllib3==2.3.0
220
+ - uvicorn==0.34.0
221
+ - videoio==0.3.0
222
+ - wandb==0.19.8
223
+ - websockets==15.0.1
224
+ - yarl==1.20.1
225
+ - zipp==3.21.0
training/accelerator_configs/accelerate_test.py ADDED
@@ -0,0 +1,17 @@
1
+ # accelerate_test.py
2
+ from accelerate import Accelerator
3
+ import os
4
+ print("MADE IT HERE")
5
+ # Force unbuffered printing
6
+ import sys; sys.stdout.reconfigure(line_buffering=True)
7
+
8
+ acc = Accelerator()
9
+ print(acc.num_processes)
10
+ print(
11
+ f"[host {os.uname().nodename}] "
12
+ f"global rank {acc.process_index}/{acc.num_processes}, "
13
+ f"local rank {acc.local_process_index}"
14
+ )
15
+
16
+ # Print out assigned CUDA device
17
+ print(f"Device: {acc.device}")
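+ # Sanity check for a distributed setup, launched e.g. with:
+ # accelerate launch --config_file training/accelerator_configs/accelerator_multigpu.yaml training/accelerator_configs/accelerate_test.py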
training/accelerator_configs/accelerator_multigpu.yaml ADDED
@@ -0,0 +1,6 @@
1
+ # Specify distributed_type as `MULTI_GPU` for DDP
2
+ distributed_type: "MULTI_GPU"
3
+ # Can be one of "no", "fp16", or "bf16" (see `transformer_engine.yaml` for `fp8`)
4
+ mixed_precision: "bf16"
5
+ # Specify the number of GPUs to use
6
+ num_processes: 4
training/accelerator_configs/accelerator_multinode.yaml ADDED
@@ -0,0 +1,4 @@
1
+ distributed_type: "MULTI_GPU"
2
+ mixed_precision: "bf16"
3
+ num_processes: 16
4
+ num_machines: 4
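+ # 16 total processes across 4 machines, i.e. 4 GPUs per node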
training/accelerator_configs/accelerator_singlegpu.yaml ADDED
@@ -0,0 +1,25 @@
1
+ compute_environment: LOCAL_MACHINE
2
+ main_process_port: 29501
3
+ debug: false
4
+ deepspeed_config:
5
+ gradient_accumulation_steps: 1
6
+ gradient_clipping: 1.0
7
+ offload_optimizer_device: none
8
+ offload_param_device: none
9
+ zero3_init_flag: false
10
+ zero_stage: 2
11
+ distributed_type: DEEPSPEED
12
+ downcast_bf16: 'no'
13
+ enable_cpu_affinity: false
14
+ machine_rank: 0
15
+ main_training_function: main
16
+ dynamo_backend: 'no'
17
+ mixed_precision: 'no'
18
+ num_machines: 1
19
+ num_processes: 1
20
+ rdzv_backend: static
21
+ same_network: true
22
+ tpu_env: []
23
+ tpu_use_cluster: false
24
+ tpu_use_sudo: false
25
+ use_cpu: false
training/accelerator_configs/accelerator_val_config.yaml ADDED
@@ -0,0 +1,25 @@
1
+ compute_environment: LOCAL_MACHINE
2
+ main_process_port: 29501
3
+ debug: false
4
+ deepspeed_config:
5
+ gradient_accumulation_steps: 1
6
+ gradient_clipping: 1.0
7
+ offload_optimizer_device: none
8
+ offload_param_device: none
9
+ zero3_init_flag: false
10
+ zero_stage: 2
11
+ distributed_type: DEEPSPEED
12
+ downcast_bf16: 'no'
13
+ enable_cpu_affinity: false
14
+ machine_rank: 0
15
+ main_training_function: main
16
+ dynamo_backend: 'no'
17
+ mixed_precision: 'no'
18
+ num_machines: 1
19
+ num_processes: 4
20
+ rdzv_backend: static
21
+ same_network: true
22
+ tpu_env: []
23
+ tpu_use_cluster: false
24
+ tpu_use_sudo: false
25
+ use_cpu: false
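+ # Identical to accelerator_singlegpu.yaml except for num_processes: 4 (multi-GPU validation runs)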
training/available-qos.txt ADDED
@@ -0,0 +1,10 @@
1
+ Name Priority GraceTime Preempt PreemptExemptTime PreemptMode Flags UsageThres UsageFactor GrpTRES GrpTRESMins GrpTRESRunMin GrpJobs GrpSubmit GrpWall MaxTRES MaxTRESPerNode MaxTRESMins MaxWall MaxTRESPU MaxJobsPU MaxSubmitPU MaxTRESPA MaxJobsPA MaxSubmitPA MinTRES
2
+ ---------- ---------- ---------- ---------- ------------------- ----------- ---------------------------------------- ---------- ----------- ------------- ------------- ------------- ------- --------- ----------- ------------- -------------- ------------- ----------- ------------- --------- ----------- ------------- --------- ----------- -------------
3
+ normal 0 00:00:00 cluster 1.000000
4
+ gpu1-32h 10000 00:00:00 scavenger cluster DenyOnLimit 1.000000 1-08:00:00 cpu=28,gres/+
5
+ gpu2-16h 10000 00:00:00 scavenger cluster DenyOnLimit 1.000000 16:00:00 cpu=56,gres/+
6
+ gpu4-8h 10000 00:00:00 scavenger cluster DenyOnLimit 1.000000 08:00:00 cpu=112,gres+
7
+ gpu8-4h 10000 00:00:00 scavenger cluster DenyOnLimit 1.000000 04:00:00 cpu=224,gres+
8
+ gpu16-2h 10000 00:00:00 scavenger cluster DenyOnLimit 1.000000 02:00:00 cpu=448,gres+
9
+ gpu32-1h 10000 00:00:00 scavenger cluster DenyOnLimit 1.000000 01:00:00 cpu=896,gres+
10
+ scavenger 0 00:00:00 01:00:00 cluster 0.250000
training/configs/baist_test.yaml ADDED
@@ -0,0 +1,77 @@
1
+ # === Required or overridden ===
2
+ base_dir: "/datasets/sai/gencam/blur2vid"
3
+ pretrained_model_name_or_path: "CogVideoX-2b/models--THUDM--CogVideoX-2b/snapshots/1137dacfc2c9c012bed6a0793f4ecf2ca8e7ba01" # Replace with actual path or env var expansion
4
+ video_root_dir: "datasets/b-aist"
5
+ csv_path: "set-path-to-csv-file" # Replace with actual CSV path
6
+ output_dir: "cogvideox-baist-test"
7
+ tracker_name: "cogvideox-baist-test"
8
+
9
+ # === Data-related ===
10
+ stride_min: 1
11
+ stride_max: 3
12
+ hflip_p: 0.5
13
+ downscale_coef: 8
14
+ init_from_transformer: true
15
+ dataloader_num_workers: 32
16
+ val_split: "test"
17
+ dataset: "baist"
18
+
19
+ # === Validation ===
20
+ num_inference_steps: 50
21
+ validation_prompt: ""
22
+ validation_video: "../resources/car.mp4:::../resources/ship.mp4"
23
+ validation_prompt_separator: ":::"
24
+ num_validation_videos: 1
25
+ validation_steps: 400
26
+ guidance_scale: 1.1
27
+ use_dynamic_cfg: false
28
+ just_validate: true
29
+ special_info: "just_one"
30
+
31
+ # === Training ===
32
+ seed: 42
33
+ mixed_precision: "bf16"
34
+ height: 720
35
+ width: 1280
36
+ fps: 8
37
+ max_num_frames: 17
38
+ train_batch_size: 2
39
+ num_train_epochs: 100
40
+ max_train_steps: null
41
+ checkpointing_steps: 200
42
+ checkpoints_total_limit: null
43
+ gradient_accumulation_steps: 1
44
+ gradient_checkpointing: true
45
+ learning_rate: 0.0001
46
+ scale_lr: false
47
+ lr_scheduler: "constant"
48
+ lr_warmup_steps: 250
49
+ lr_num_cycles: 1
50
+ lr_power: 1.0
51
+ enable_slicing: true
52
+ enable_tiling: true
53
+
54
+ # === Optimizer ===
55
+ optimizer: "adamw"
56
+ use_8bit_adam: false
57
+ adam_beta1: 0.9
58
+ adam_beta2: 0.95
59
+ prodigy_beta3: null
60
+ prodigy_decouple: false
61
+ adam_weight_decay: 0.0001
62
+ adam_epsilon: 0.0000001
63
+ max_grad_norm: 1.0
64
+ prodigy_use_bias_correction: false
65
+ prodigy_safeguard_warmup: false
66
+
67
+ # === Logging & Reporting ===
68
+ push_to_hub: false
69
+ hub_token: null
70
+ hub_model_id: null
71
+ logging_dir: "logs"
72
+ allow_tf32: true
73
+ report_to: null
74
+
75
+ # === Optional HuggingFace model variant ===
76
+ revision: null
77
+ variant: null
training/configs/baist_train.yaml ADDED
@@ -0,0 +1,78 @@
1
+ # === Required or overridden ===
2
+ base_dir: "/datasets/sai/gencam/blur2vid"
3
+ pretrained_model_name_or_path: "CogVideoX-2b/models--THUDM--CogVideoX-2b/snapshots/1137dacfc2c9c012bed6a0793f4ecf2ca8e7ba01" # Replace with actual path or env var expansion
4
+ video_root_dir: "datasets/b-aist"
5
+ csv_path: "set-path-to-csv-file" # Replace with actual CSV path
6
+ output_dir: "cogvideox-baist-train"
7
+ tracker_name: "cogvideox-baist-train"
8
+
9
+
10
+ # === Data-related ===
11
+ stride_min: 1
12
+ stride_max: 3
13
+ hflip_p: 0.5
14
+ downscale_coef: 8
15
+ init_from_transformer: true
16
+ dataloader_num_workers: 32
17
+ val_split: "val"
18
+ dataset: "baist"
19
+
20
+ # === Validation ===
21
+ num_inference_steps: 50
22
+ validation_prompt: ""
23
+ validation_video: "../resources/car.mp4:::../resources/ship.mp4"
24
+ validation_prompt_separator: ":::"
25
+ num_validation_videos: 1
26
+ validation_steps: 400
27
+ guidance_scale: 1.1
28
+ use_dynamic_cfg: false
29
+ just_validate: false
30
+ special_info: "just_one"
31
+
32
+ # === Training ===
33
+ seed: 42
34
+ mixed_precision: "bf16"
35
+ height: 720
36
+ width: 1280
37
+ fps: 8
38
+ max_num_frames: 17
39
+ train_batch_size: 2
40
+ num_train_epochs: 100
41
+ max_train_steps: null
42
+ checkpointing_steps: 200
43
+ checkpoints_total_limit: null
44
+ gradient_accumulation_steps: 1
45
+ gradient_checkpointing: true
46
+ learning_rate: 0.0001
47
+ scale_lr: false
48
+ lr_scheduler: "constant"
49
+ lr_warmup_steps: 250
50
+ lr_num_cycles: 1
51
+ lr_power: 1.0
52
+ enable_slicing: true
53
+ enable_tiling: true
54
+
55
+ # === Optimizer ===
56
+ optimizer: "adamw"
57
+ use_8bit_adam: false
58
+ adam_beta1: 0.9
59
+ adam_beta2: 0.95
60
+ prodigy_beta3: null
61
+ prodigy_decouple: false
62
+ adam_weight_decay: 0.0001
63
+ adam_epsilon: 0.0000001
64
+ max_grad_norm: 1.0
65
+ prodigy_use_bias_correction: false
66
+ prodigy_safeguard_warmup: false
67
+
68
+ # === Logging & Reporting ===
69
+ push_to_hub: false
70
+ hub_token: null
71
+ hub_model_id: null
72
+ logging_dir: "logs"
73
+ allow_tf32: true
74
+ report_to: null
75
+
76
+ # === Optional HuggingFace model variant ===
77
+ revision: null
78
+ variant: null
training/configs/full_test.yaml ADDED
@@ -0,0 +1,78 @@
1
+ # === Required or overridden ===
2
+ base_dir: "/datasets/sai/gencam/blur2vid"
3
+ pretrained_model_name_or_path: "CogVideoX-2b/models--THUDM--CogVideoX-2b/snapshots/1137dacfc2c9c012bed6a0793f4ecf2ca8e7ba01" # Replace with actual path or env var expansion
4
+ video_root_dir: "datasets/FullDataset"
5
+ csv_path: "set-path-to-csv-file" # Replace with actual CSV path
6
+ output_dir: "cogvideox-full-test"
7
+ tracker_name: "cogvideox-full-test"
8
+
9
+
10
+ # === Data-related ===
11
+ stride_min: 1
12
+ stride_max: 3
13
+ hflip_p: 0.5
14
+ downscale_coef: 8
15
+ init_from_transformer: true
16
+ dataloader_num_workers: 32
17
+ val_split: "test"
18
+ dataset: "full"
19
+
20
+ # === Validation ===
21
+ num_inference_steps: 50
22
+ validation_prompt: ""
23
+ validation_video: "../resources/car.mp4:::../resources/ship.mp4"
24
+ validation_prompt_separator: ":::"
25
+ num_validation_videos: 1
26
+ validation_steps: 400
27
+ guidance_scale: 1.1
28
+ use_dynamic_cfg: false
29
+ just_validate: true
30
+ special_info: "just_one"
31
+
32
+ # === Training ===
33
+ seed: 42
34
+ mixed_precision: "bf16"
35
+ height: 720
36
+ width: 1280
37
+ fps: 8
38
+ max_num_frames: 17
39
+ train_batch_size: 2
40
+ num_train_epochs: 200
41
+ max_train_steps: null
42
+ checkpointing_steps: 200
43
+ checkpoints_total_limit: null
44
+ gradient_accumulation_steps: 2
45
+ gradient_checkpointing: true
46
+ learning_rate: 0.0001
47
+ scale_lr: false
48
+ lr_scheduler: "constant"
49
+ lr_warmup_steps: 250
50
+ lr_num_cycles: 1
51
+ lr_power: 1.0
52
+ enable_slicing: true
53
+ enable_tiling: true
54
+
55
+ # === Optimizer ===
56
+ optimizer: "adamw"
57
+ use_8bit_adam: false
58
+ adam_beta1: 0.9
59
+ adam_beta2: 0.95
60
+ prodigy_beta3: null
61
+ prodigy_decouple: false
62
+ adam_weight_decay: 0.0001
63
+ adam_epsilon: 0.0000001
64
+ max_grad_norm: 1.0
65
+ prodigy_use_bias_correction: false
66
+ prodigy_safeguard_warmup: false
67
+
68
+ # === Logging & Reporting ===
69
+ push_to_hub: false
70
+ hub_token: null
71
+ hub_model_id: null
72
+ logging_dir: "logs"
73
+ allow_tf32: true
74
+ report_to: null
75
+
76
+ # === Optional HuggingFace model variant ===
77
+ revision: null
78
+ variant: null
training/configs/full_train.yaml ADDED
@@ -0,0 +1,78 @@
1
+ # === Required or overridden ===
2
+ base_dir: "/datasets/sai/gencam/blur2vid"
3
+ pretrained_model_name_or_path: "CogVideoX-2b/models--THUDM--CogVideoX-2b/snapshots/1137dacfc2c9c012bed6a0793f4ecf2ca8e7ba01" # Replace with actual path or env var expansion
4
+ video_root_dir: "datasets/FullDataset"
5
+ csv_path: "set-path-to-csv-file" # Replace with actual CSV path
6
+ output_dir: "cogvideox-full-train"
7
+ tracker_name: "cogvideox-full-train"
8
+
9
+
10
+ # === Data-related ===
11
+ stride_min: 1
12
+ stride_max: 3
13
+ hflip_p: 0.5
14
+ downscale_coef: 8
15
+ init_from_transformer: true
16
+ dataloader_num_workers: 2
17
+ val_split: "val"
18
+ dataset: "full"
19
+
20
+ # === Validation ===
21
+ num_inference_steps: 50
22
+ validation_prompt: ""
23
+ validation_video: "../resources/car.mp4:::../resources/ship.mp4"
24
+ validation_prompt_separator: ":::"
25
+ num_validation_videos: 1
26
+ validation_steps: 400
27
+ guidance_scale: 1.0
28
+ use_dynamic_cfg: false
29
+ just_validate: false
30
+ special_info: "just_one"
31
+
32
+ # === Training ===
33
+ seed: 42
34
+ mixed_precision: "bf16"
35
+ height: 720
36
+ width: 1280
37
+ fps: 8
38
+ max_num_frames: 17
39
+ train_batch_size: 2
40
+ num_train_epochs: 200
41
+ max_train_steps: null
42
+ checkpointing_steps: 200
43
+ checkpoints_total_limit: null
44
+ gradient_accumulation_steps: 2
45
+ gradient_checkpointing: true
46
+ learning_rate: 0.0001
47
+ scale_lr: false
48
+ lr_scheduler: "constant"
49
+ lr_warmup_steps: 250
50
+ lr_num_cycles: 1
51
+ lr_power: 1.0
52
+ enable_slicing: true
53
+ enable_tiling: true
54
+
55
+ # === Optimizer ===
56
+ optimizer: "adamw"
57
+ use_8bit_adam: false
58
+ adam_beta1: 0.9
59
+ adam_beta2: 0.95
60
+ prodigy_beta3: null
61
+ prodigy_decouple: false
62
+ adam_weight_decay: 0.0001
63
+ adam_epsilon: 0.0000001
64
+ max_grad_norm: 1.0
65
+ prodigy_use_bias_correction: false
66
+ prodigy_safeguard_warmup: false
67
+
68
+ # === Logging & Reporting ===
69
+ push_to_hub: false
70
+ hub_token: null
71
+ hub_model_id: null
72
+ logging_dir: "logs"
73
+ allow_tf32: true
74
+ report_to: null
75
+
76
+ # === Optional HuggingFace model variant ===
77
+ revision: null
78
+ variant: null
training/configs/gopro_2x_test.yaml ADDED
@@ -0,0 +1,78 @@
1
+ # === Required or overridden ===
2
+ base_dir: "/datasets/sai/gencam/blur2vid"
3
+ pretrained_model_name_or_path: "CogVideoX-2b/models--THUDM--CogVideoX-2b/snapshots/1137dacfc2c9c012bed6a0793f4ecf2ca8e7ba01" # Replace with actual path or env var expansion
4
+ video_root_dir: "datasets/GOPRO_7"
5
+ csv_path: "set-path-to-csv-file" # Replace with actual CSV path
6
+ output_dir: "cogvideox-gopro-2x-test"
7
+ tracker_name: "cogvideox-gopro-2x-test"
8
+
9
+
10
+ # === Data-related ===
11
+ stride_min: 1
12
+ stride_max: 3
13
+ hflip_p: 0.5
14
+ downscale_coef: 8
15
+ init_from_transformer: true
16
+ dataloader_num_workers: 32
17
+ val_split: "test"
18
+ dataset: "gopro2x"
19
+
20
+ # === Validation ===
21
+ num_inference_steps: 50
22
+ validation_prompt: ""
23
+ validation_video: "../resources/car.mp4:::../resources/ship.mp4"
24
+ validation_prompt_separator: ":::"
25
+ num_validation_videos: 1
26
+ validation_steps: 400
27
+ guidance_scale: 1.1
28
+ use_dynamic_cfg: false
29
+ just_validate: true
30
+ special_info: "just_one"
31
+
32
+ # === Training ===
33
+ seed: 42
34
+ mixed_precision: "bf16"
35
+ height: 720
36
+ width: 1280
37
+ fps: 8
38
+ max_num_frames: 17
39
+ train_batch_size: 4
40
+ num_train_epochs: 100
41
+ max_train_steps: null
42
+ checkpointing_steps: 400
43
+ checkpoints_total_limit: null
44
+ gradient_accumulation_steps: 1
45
+ gradient_checkpointing: true
46
+ learning_rate: 0.0001
47
+ scale_lr: false
48
+ lr_scheduler: "constant"
49
+ lr_warmup_steps: 250
50
+ lr_num_cycles: 1
51
+ lr_power: 1.0
52
+ enable_slicing: true
53
+ enable_tiling: true
54
+
55
+ # === Optimizer ===
56
+ optimizer: "adamw"
57
+ use_8bit_adam: false
58
+ adam_beta1: 0.9
59
+ adam_beta2: 0.95
60
+ prodigy_beta3: null
61
+ prodigy_decouple: false
62
+ adam_weight_decay: 0.0001
63
+ adam_epsilon: 0.0000001
64
+ max_grad_norm: 1.0
65
+ prodigy_use_bias_correction: false
66
+ prodigy_safeguard_warmup: false
67
+
68
+ # === Logging & Reporting ===
69
+ push_to_hub: false
70
+ hub_token: null
71
+ hub_model_id: null
72
+ logging_dir: "logs"
73
+ allow_tf32: true
74
+ report_to: null
75
+
76
+ # === Optional HuggingFace model variant ===
77
+ revision: null
78
+ variant: null
training/configs/gopro_test.yaml ADDED
@@ -0,0 +1,78 @@
1
+ # === Required or overridden ===
2
+ base_dir: "/datasets/sai/gencam/blur2vid"
3
+ pretrained_model_name_or_path: "CogVideoX-2b/models--THUDM--CogVideoX-2b/snapshots/1137dacfc2c9c012bed6a0793f4ecf2ca8e7ba01" # Replace with actual path or env var expansion
4
+ video_root_dir: "datasets/GOPRO_7"
5
+ csv_path: "set-path-to-csv-file" # Replace with actual CSV path
6
+ output_dir: "cogvideox-gopro-test"
7
+ tracker_name: "cogvideox-gopro-test"
8
+
9
+
10
+ # === Data-related ===
11
+ stride_min: 1
12
+ stride_max: 3
13
+ hflip_p: 0.5
14
+ downscale_coef: 8
15
+ init_from_transformer: true
16
+ dataloader_num_workers: 32
17
+ val_split: "test"
18
+ dataset: "gopro"
19
+
20
+ # === Validation ===
21
+ num_inference_steps: 50
22
+ validation_prompt: ""
23
+ validation_video: "../resources/car.mp4:::../resources/ship.mp4"
24
+ validation_prompt_separator: ":::"
25
+ num_validation_videos: 1
26
+ validation_steps: 400
27
+ guidance_scale: 1.1
28
+ use_dynamic_cfg: false
29
+ just_validate: true
30
+ special_info: "just_one"
31
+
32
+ # === Training ===
33
+ seed: 42
34
+ mixed_precision: "bf16"
35
+ height: 720
36
+ width: 1280
37
+ fps: 8
38
+ max_num_frames: 9
39
+ train_batch_size: 4
40
+ num_train_epochs: 500
41
+ max_train_steps: null
42
+ checkpointing_steps: 100
43
+ checkpoints_total_limit: null
44
+ gradient_accumulation_steps: 1
45
+ gradient_checkpointing: true
46
+ learning_rate: 0.0001
47
+ scale_lr: false
48
+ lr_scheduler: "constant"
49
+ lr_warmup_steps: 250
50
+ lr_num_cycles: 1
51
+ lr_power: 1.0
52
+ enable_slicing: true
53
+ enable_tiling: true
54
+
55
+ # === Optimizer ===
56
+ optimizer: "adamw"
57
+ use_8bit_adam: false
58
+ adam_beta1: 0.9
59
+ adam_beta2: 0.95
60
+ prodigy_beta3: null
61
+ prodigy_decouple: false
62
+ adam_weight_decay: 0.0001
63
+ adam_epsilon: 0.0000001
64
+ max_grad_norm: 1.0
65
+ prodigy_use_bias_correction: false
66
+ prodigy_safeguard_warmup: false
67
+
68
+ # === Logging & Reporting ===
69
+ push_to_hub: false
70
+ hub_token: null
71
+ hub_model_id: null
72
+ logging_dir: "logs"
73
+ allow_tf32: true
74
+ report_to: null
75
+
76
+ # === Optional HuggingFace model variant ===
77
+ revision: null
78
+ variant: null
training/configs/gopro_train.yaml ADDED
@@ -0,0 +1,77 @@
1
+ # === Required or overridden ===
2
+ base_dir: "/datasets/sai/gencam/blur2vid"
3
+ pretrained_model_name_or_path: "CogVideoX-2b/models--THUDM--CogVideoX-2b/snapshots/1137dacfc2c9c012bed6a0793f4ecf2ca8e7ba01" # Replace with actual path or env var expansion
4
+ video_root_dir: "datasets/GOPRO_7"
5
+ csv_path: "set-path-to-csv-file" # Replace with actual CSV path
6
+ output_dir: "cogvideox-gopro-train"
7
+ tracker_name: "cogvideox-gopro-train"
8
+
9
+ # === Data-related ===
10
+ stride_min: 1
11
+ stride_max: 3
12
+ hflip_p: 0.5
13
+ downscale_coef: 8
14
+ init_from_transformer: true
15
+ dataloader_num_workers: 2
16
+ val_split: "val"
17
+ dataset: "gopro"
18
+
19
+ # === Validation ===
20
+ num_inference_steps: 50
21
+ validation_prompt: ""
22
+ validation_video: "../resources/car.mp4:::../resources/ship.mp4"
23
+ validation_prompt_separator: ":::"
24
+ num_validation_videos: 1
25
+ validation_steps: 100
26
+ guidance_scale: 1.0
27
+ use_dynamic_cfg: false
28
+ just_validate: false
29
+ special_info: "just_one"
30
+
31
+ # === Training ===
32
+ seed: 42
33
+ mixed_precision: "bf16"
34
+ height: 720
35
+ width: 1280
36
+ fps: 8
37
+ max_num_frames: 9
38
+ train_batch_size: 4
39
+ num_train_epochs: 500
40
+ max_train_steps: null
41
+ checkpointing_steps: 100
42
+ checkpoints_total_limit: null
43
+ gradient_accumulation_steps: 1
44
+ gradient_checkpointing: true
45
+ learning_rate: 0.0001
46
+ scale_lr: false
47
+ lr_scheduler: "constant"
48
+ lr_warmup_steps: 250
49
+ lr_num_cycles: 1
50
+ lr_power: 1.0
51
+ enable_slicing: true
52
+ enable_tiling: true
53
+
54
+ # === Optimizer ===
55
+ optimizer: "adamw"
56
+ use_8bit_adam: false
57
+ adam_beta1: 0.9
58
+ adam_beta2: 0.95
59
+ prodigy_beta3: null
60
+ prodigy_decouple: false
61
+ adam_weight_decay: 0.0001
62
+ adam_epsilon: 0.0000001
63
+ max_grad_norm: 1.0
64
+ prodigy_use_bias_correction: false
65
+ prodigy_safeguard_warmup: false
66
+
67
+ # === Logging & Reporting ===
68
+ push_to_hub: false
69
+ hub_token: null
70
+ hub_model_id: null
71
+ logging_dir: "logs"
72
+ allow_tf32: true
73
+ report_to: null
74
+
75
+ # === Optional HuggingFace model variant ===
76
+ revision: null
77
+ variant: null
training/configs/outsidephotos.yaml ADDED
@@ -0,0 +1,76 @@
1
+ # === Required or overridden ===
2
+ base_dir: "/datasets/sai/gencam/blur2vid"
3
+ pretrained_model_name_or_path: "cogvideox/CogVideoX-2b/models--THUDM--CogVideoX-2b/snapshots/1137dacfc2c9c012bed6a0793f4ecf2ca8e7ba01" # Replace with actual path or env var expansion
4
+ video_root_dir: "datasets/my_motion_blurred_images"
5
+ csv_path: "set-path-to-csv-file" # Replace with actual CSV path
6
+ output_dir: "cogvideox-outsidephotos"
7
+ tracker_name: "cogvideox-outsidephotos"
8
+
9
+ # === Data-related ===
10
+ stride_min: 1
11
+ stride_max: 3
12
+ hflip_p: 0.5
13
+ downscale_coef: 8
14
+ init_from_transformer: true
15
+ dataloader_num_workers: 0
16
+ val_split: "test"
17
+ dataset: "outsidephotos"
18
+
19
+ # === Validation ===
20
+ num_inference_steps: 50
21
+ just_validate: true
22
+ validation_prompt: ""
23
+ validation_video: "../resources/car.mp4:::../resources/ship.mp4"
24
+ validation_prompt_separator: ":::"
25
+ num_validation_videos: 1
26
+ validation_steps: 100
27
+ guidance_scale: 1.1
28
+ use_dynamic_cfg: false
29
+
30
+ # === Training ===
31
+ seed: 42
32
+ mixed_precision: "bf16"
33
+ height: 720
34
+ width: 1280
35
+ fps: 8
36
+ max_num_frames: 17
37
+ train_batch_size: 1
38
+ num_train_epochs: 100
39
+ max_train_steps: null
40
+ checkpointing_steps: 100
41
+ checkpoints_total_limit: null
42
+ gradient_accumulation_steps: 1
43
+ gradient_checkpointing: true
44
+ learning_rate: 0.0001
45
+ scale_lr: false
46
+ lr_scheduler: "constant"
47
+ lr_warmup_steps: 250
48
+ lr_num_cycles: 1
49
+ lr_power: 1.0
50
+ enable_slicing: true
51
+ enable_tiling: true
52
+
53
+ # === Optimizer ===
54
+ optimizer: "adamw"
55
+ use_8bit_adam: false
56
+ adam_beta1: 0.9
57
+ adam_beta2: 0.95
58
+ prodigy_beta3: null
59
+ prodigy_decouple: false
60
+ adam_weight_decay: 0.0001
61
+ adam_epsilon: 0.0000001
62
+ max_grad_norm: 1.0
63
+ prodigy_use_bias_correction: false
64
+ prodigy_safeguard_warmup: false
65
+
66
+ # === Logging & Reporting ===
67
+ push_to_hub: false
68
+ hub_token: null
69
+ hub_model_id: null
70
+ logging_dir: "logs"
71
+ allow_tf32: true
72
+ report_to: null
73
+
74
+ # === Optional HuggingFace model variant ===
75
+ revision: null
76
+ variant: null
training/controlnet_datasets.py ADDED
@@ -0,0 +1,735 @@
1
+ import io
2
+ import os
3
+ import glob
4
+ from pathlib import Path
5
+ import pickle
6
+ import random
7
+ import time
8
+
9
+
10
+ import cv2
11
+ import torch
12
+ import numpy as np
13
+ import pandas as pd
14
+ import torchvision.transforms as transforms
15
+ from PIL import Image, ImageOps, ImageCms
16
+ from decord import VideoReader
17
+ from torch.utils.data.dataset import Dataset
18
+ from controlnet_aux import CannyDetector, HEDdetector
19
+ import torch.nn.functional as F
20
+ from helpers import generate_1x_sequence, generate_2x_sequence, generate_large_blur_sequence, generate_test_case
21
+
22
+
23
+ def unpack_mm_params(p):
24
+ if isinstance(p, (tuple, list)):
25
+ return p[0], p[1]
26
+ elif isinstance(p, (int, float)):
27
+ return p, p
28
+ raise Exception(f'Unknown input parameter type.\nParameter: {p}.\nType: {type(p)}')
29
+
30
+
31
+ def resize_for_crop(image, min_h, min_w):
32
+ img_h, img_w = image.shape[-2:]
33
+
34
+ if img_h >= min_h and img_w >= min_w:
35
+ coef = min(min_h / img_h, min_w / img_w)
36
+ elif img_h <= min_h and img_w <= min_w:
37
+ coef = max(min_h / img_h, min_w / img_w)
38
+ else:
39
+ coef = min_h / img_h if min_h > img_h else min_w / img_w
40
+
41
+ out_h, out_w = int(img_h * coef), int(img_w * coef)
42
+ resized_image = transforms.functional.resize(image, (out_h, out_w), antialias=True)
43
+ return resized_image
44
+
45
+
46
+
47
+ class BaseClass(Dataset):
48
+ def __init__(
49
+ self,
50
+ data_dir,
51
+ output_dir,
52
+ image_size=(320, 512),
53
+ hflip_p=0.5,
54
+ controlnet_type='canny',
55
+ split='train',
56
+ *args,
57
+ **kwargs
58
+ ):
59
+ self.split = split
60
+ self.height, self.width = unpack_mm_params(image_size)
61
+ self.data_dir = data_dir
62
+ self.output_dir = output_dir
63
+ self.hflip_p = hflip_p
64
+ self.image_size = image_size
65
+ self.length = 0
66
+
67
+ def __len__(self):
68
+ return self.length
69
+
70
+
71
+ def load_frames(self, frames):
72
+ # frames: numpy array of shape (N, H, W, C), 0–255
73
+ # → tensor of shape (N, C, H, W) as float
74
+ pixel_values = torch.from_numpy(frames).permute(0, 3, 1, 2).contiguous().float()
75
+ # normalize to [-1, 1]
76
+ pixel_values = pixel_values / 127.5 - 1.0
77
+ # resize to (self.height, self.width)
78
+ pixel_values = F.interpolate(
79
+ pixel_values,
80
+ size=(self.height, self.width),
81
+ mode="bilinear",
82
+ align_corners=False
83
+ )
84
+ return pixel_values
85
+
86
+ def get_batch(self, idx):
87
+ raise NotImplementedError('get_batch must be implemented by subclasses.')
88
+
89
+ def __getitem__(self, idx):
90
+ while True:
91
+ try:
92
+ video, caption, motion_blur = self.get_batch(idx)
93
+ break
94
+ except Exception as e:
95
+ print(e)
96
+ idx = random.randint(0, self.length - 1)
97
+
98
+ video, = [
99
+ resize_for_crop(x, self.height, self.width) for x in [video]
100
+ ]
101
+ video, = [
102
+ transforms.functional.center_crop(x, (self.height, self.width)) for x in [video]
103
+ ]
104
+ data = {
105
+ 'video': video,
106
+ 'caption': caption,
107
+ }
108
+ return data
109
+
110
+ def load_as_srgb(path):
111
+ img = Image.open(path)
112
+ img = ImageOps.exif_transpose(img)
113
+
114
+ if 'icc_profile' in img.info:
115
+ icc = img.info['icc_profile']
116
+ src_profile = ImageCms.ImageCmsProfile(io.BytesIO(icc))
117
+ dst_profile = ImageCms.createProfile("sRGB")
118
+ img = ImageCms.profileToProfile(img, src_profile, dst_profile, outputMode='RGB')
119
+ else:
120
+ img = img.convert("RGB") # Assume sRGB
121
+ return img
122
+
123
+ class GoProMotionBlurDataset(BaseClass): # 7-frame GoPro dataset
124
+ def __init__(self,
125
+ *args, **kwargs):
126
+ super().__init__(*args, **kwargs)
127
+ # Set blur and sharp directories based on split
128
+ if self.split == 'train':
129
+ self.blur_root = os.path.join(self.data_dir, 'train', 'blur')
130
+ self.sharp_root = os.path.join(self.data_dir, 'train', 'sharp')
131
+ elif self.split in ['val', 'test']:
132
+ self.blur_root = os.path.join(self.data_dir, 'test', 'blur')
133
+ self.sharp_root = os.path.join(self.data_dir, 'test', 'sharp')
134
+ else:
135
+ raise ValueError(f"Unsupported split: {self.split}")
136
+
137
+ # Collect all blurred image paths
138
+ pattern = os.path.join(self.blur_root, '*', '*.png')
139
+
140
+ self.blur_paths = sorted(glob.glob(pattern))
141
+
142
+ if self.split == 'val':
143
+ # Optional: limit validation subset
144
+ self.blur_paths = self.blur_paths[:5]
145
+
146
+ filtered_blur_paths = []
147
+ for path in self.blur_paths:
148
+ output_deblurred_dir = os.path.join(self.output_dir, "deblurred")
149
+ full_output_path = Path(output_deblurred_dir, *path.split('/')[-2:]).with_suffix(".mp4")
150
+ if not os.path.exists(full_output_path):
151
+ filtered_blur_paths.append(path)
152
+
153
+ # Window and padding parameters
154
+ self.window_size = 7 # original number of sharp frames
155
+ self.pad = 2 # number of times to repeat last frame
156
+ self.output_length = self.window_size + self.pad
157
+ self.half_window = self.window_size // 2
158
+ self.length = len(self.blur_paths)
159
+
160
+ # Normalized input interval: always [-0.5, 0.5]
161
+ self.input_interval = torch.tensor([[-0.5, 0.5]], dtype=torch.float)
162
+
163
+ # Precompute normalized output intervals: first for window_size frames, then pad duplicates
164
+ step = 1.0 / (self.window_size - 1)
165
+ # intervals for the original 7 frames
166
+ window_intervals = []
167
+ for i in range(self.window_size):
168
+ start = -0.5 + i * step
169
+ if i < self.window_size - 1:
170
+ end = -0.5 + (i + 1) * step
171
+ else:
172
+ end = 0.5
173
+ window_intervals.append([start, end])
174
+ # append the last interval pad times
175
+ intervals = window_intervals + [window_intervals[-1]] * self.pad
176
+ self.output_interval = torch.tensor(intervals, dtype=torch.float)
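+ # Example: window_size=7 gives step = 1/6, so the intervals are [-0.5, -1/3], [-1/3, -1/6], ...,
+ # [1/3, 0.5] and finally the degenerate [0.5, 0.5] for the last frame; that last interval is
+ # repeated pad=2 more times, giving output_length = 9 entries.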
177
+
178
+ def __len__(self):
179
+ return self.length
180
+
181
+ def __getitem__(self, idx):
182
+ # Path to the blurred (center) frame
183
+ blur_path = self.blur_paths[idx]
184
+ seq_name = os.path.basename(os.path.dirname(blur_path))
185
+ frame_name = os.path.basename(blur_path)
186
+ center_idx = int(os.path.splitext(frame_name)[0])
187
+
188
+ # Compute sharp frame range [center-half, center+half]
189
+ start_idx = center_idx - self.half_window
190
+ end_idx = center_idx + self.half_window
191
+
192
+ # Load sharp frames
193
+ sharp_dir = os.path.join(self.sharp_root, seq_name)
194
+ frames = []
195
+ for i in range(start_idx, end_idx + 1):
196
+ sharp_filename = f"{i:06d}.png"
197
+ sharp_path = os.path.join(sharp_dir, sharp_filename)
198
+ img = Image.open(sharp_path).convert('RGB')
199
+ frames.append(img)
200
+
201
+ # Repeat last sharp frame so total frames == output_length
202
+ while len(frames) < self.output_length:
203
+ frames.append(frames[-1])
204
+
205
+ # Load blurred image
206
+ blur_img = Image.open(blur_path).convert('RGB')
207
+
208
+ # Convert to pixel values via BaseClass loader
209
+ video = self.load_frames(np.array(frames)) # shape: (output_length, H, W, C)
210
+ blur_input = self.load_frames(np.expand_dims(np.array(blur_img), 0)) # shape: (1, H, W, C)
211
+ end_time = time.time()
212
+ data = {
213
+ 'file_name': os.path.join(seq_name, frame_name),
214
+ 'blur_img': blur_input,
215
+ 'video': video,
216
+ "caption": "",
217
+ 'motion_blur_amount': torch.tensor(self.half_window, dtype=torch.long),
218
+ 'input_interval': self.input_interval,
219
+ 'output_interval': self.output_interval,
220
+ "num_frames": self.window_size,
221
+ "mode": "1x",
222
+ }
223
+ return data
224
+
225
+
226
+ class OutsidePhotosDataset(BaseClass):
227
+ def __init__(self, *args, **kwargs):
228
+ super().__init__(*args, **kwargs)
229
+ self.image_paths = sorted(glob.glob(os.path.join(self.data_dir, '**', '*.*'), recursive=True))
230
+
231
+ INTERVALS = [
232
+ {"in_start": 0, "in_end": 16, "out_start": 0, "out_end": 16, "center": 8, "window_size": 16, "mode": "1x", "fps": 240},
233
+ {"in_start": 4, "in_end": 12, "out_start": 0, "out_end": 16, "center": 8, "window_size": 16, "mode": "2x", "fps": 240},]
234
+ #other modes commented out for faster processing
235
+ #{"in_start": 0, "in_end": 4, "out_start": 0, "out_end": 4, "center": 2, "window_size": 4, "mode": "1x", "fps": 240},
236
+ #{"in_start": 0, "in_end": 8, "out_start": 0, "out_end": 8, "center": 4, "window_size": 8, "mode": "1x", "fps": 240},
237
+ #{"in_start": 0, "in_end": 12, "out_start": 0, "out_end": 12, "center": 6, "window_size": 12, "mode": "1x", "fps": 240},
238
+ #{"in_start": 0, "in_end": 32, "out_start": 0, "out_end": 32, "center": 12, "window_size": 32, "mode": "lb", "fps": 120}
239
+ #{"in_start": 0, "in_end": 48, "out_start": 0, "out_end": 48, "center": 24, "window_size": 48, "mode": "lb", "fps": 80}
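+ # Each INTERVALS entry parameterises one generate_test_case call: the input (blur) window
+ # [in_start, in_end], the output frame range [out_start, out_end], the centre frame, the
+ # window size, the blur mode and the source fps.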
240
+
241
+
242
+ self.cleaned_intervals = []
243
+ for image_path in self.image_paths:
244
+ for interval in INTERVALS:
245
+ #create a copy of the interval dictionary
246
+ i = interval.copy()
247
+ #add the image path to the interval dictionary
248
+ i['video_name'] = image_path
249
+ video_name = i['video_name']
250
+ mode = i['mode']
251
+
252
+ vid_name_w_extension = os.path.relpath(video_name, self.data_dir).split('.')[0] # "frame_00000"
253
+ output_name = (
254
+ f"{vid_name_w_extension}_{mode}.mp4"
255
+ )
256
+
257
+ full_output_path = os.path.join("/datasets/sai/gencam/cogvideox/training/cogvideox-outsidephotos/deblurred", output_name) #THIS IS A HACK - YOU NEED TO UPDATE THIS TO YOUR OUTPUT DIRECTORY
258
+
259
+ # Keep only if output doesn't exist
260
+ if not os.path.exists(full_output_path):
261
+ self.cleaned_intervals.append(i)
262
+
263
+
264
+ self.length = len(self.cleaned_intervals)
265
+
266
+ def __len__(self):
267
+ return self.length
268
+
269
+ def __getitem__(self, idx):
270
+
271
+ interval = self.cleaned_intervals[idx]
272
+
273
+ in_start = interval['in_start']
274
+ in_end = interval['in_end']
275
+ out_start = interval['out_start']
276
+ out_end = interval['out_end']
277
+ center = interval['center']
278
+ window = interval['window_size']
279
+ mode = interval['mode']
280
+ fps = interval['fps']
281
+
282
+
283
+ image_path = interval['video_name']
284
+ blur_img_original = load_as_srgb(image_path)
285
+ H,W = blur_img_original.size
286
+
287
+ frame_paths = []
288
+ frame_paths = ["../assets/dummy_image.png" for _ in range(window)] #any random path replicated
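+ # real photos have no ground-truth frame sequence, so a dummy image is replicated to stand in for it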
289
+
290
+ # generate test case
291
+ _, seq_frames, inp_int, out_int, high_fps_video, num_frames = generate_test_case(
292
+ frame_paths=frame_paths, window_max=window, in_start=in_start, in_end=in_end, out_start=out_start,out_end=out_end, center=center, mode=mode, fps=fps
293
+ )
294
+ file_name = image_path
295
+
296
+ # Get base directory and frame prefix
297
+ relative_file_name = os.path.relpath(file_name, self.data_dir)
298
+ base_dir = os.path.dirname(relative_file_name)
299
+ frame_stem = os.path.splitext(os.path.basename(file_name))[0] # "frame_00000"
300
+ # Build new filename
301
+ new_filename = (
302
+ f"{frame_stem}_{mode}.png"
303
+ )
304
+
305
+ blur_img = blur_img_original.resize((self.image_size[1], self.image_size[0])) # PIL resize expects (width, height)
306
+
307
+ # Final path
308
+ relative_file_name = os.path.join(base_dir, new_filename)
309
+
310
+
311
+ blur_input = self.load_frames(np.expand_dims(blur_img, 0).copy())
312
+ # seq_frames is list of frames; stack along time dim
313
+ video = self.load_frames(np.stack(seq_frames, axis=0))
314
+
315
+
316
+ data = {
317
+ 'file_name': relative_file_name,
318
+ "original_size": (H, W),
319
+ 'blur_img': blur_input,
320
+ 'video': video,
321
+ 'caption': "",
322
+ 'input_interval': inp_int,
323
+ 'output_interval': out_int,
324
+ "num_frames": num_frames,
325
+ }
326
+ return data
327
+
328
+ class FullMotionBlurDataset(BaseClass):
329
+ """
330
+ A dataset that randomly selects among 1×, 2×, or large-blur modes per sample.
331
+ Uses category-specific <split>_list.txt files under each subfolder of FullDataset to assemble sequences.
332
+ In 'test' split, it instead loads precomputed intervals from intervals_test.pkl and uses generate_test_case.
333
+ """
334
+ def __init__(self, *args, **kwargs):
335
+ super().__init__(*args, **kwargs)
336
+ self.seq_dirs = []
337
+
338
+ # TEST split: load fixed intervals early
339
+ if self.split == 'test':
340
+ pkl_path = os.path.join(self.data_dir, 'intervals_test.pkl')
341
+ with open(pkl_path, 'rb') as f:
342
+ self.test_intervals = pickle.load(f)
343
+ assert self.test_intervals, f"No test intervals found in {pkl_path}"
344
+
345
+ cleaned_intervals = []
346
+ for interval in self.test_intervals:
347
+ # Extract interval values
348
+ in_start = interval['in_start']
349
+ in_end = interval['in_end']
350
+ out_start = interval['out_start']
351
+ out_end = interval['out_end']
352
+ center = interval['center']
353
+ window = interval['window_size']
354
+ mode = interval['mode']
355
+ fps = interval['fps']
356
+ category, seq = interval['video_name'].split('/')
357
+ seq_dir = os.path.join(self.data_dir, category, 'lower_fps_frames', seq)
358
+ frame_paths = sorted(glob.glob(os.path.join(seq_dir, '*.png'))) # e.g. ".../lower_fps_frames/720p_240fps_1/frame_00247.png"
359
+ rel_path = os.path.relpath(frame_paths[center], self.data_dir)
360
+ rel_path = os.path.splitext(rel_path)[0] # remove the file extension
361
+
362
+ output_name = (
363
+ f"{rel_path}_"
364
+ f"in{in_start:04d}_ie{in_end:04d}_"
365
+ f"os{out_start:04d}_oe{out_end:04d}_"
366
+ f"ctr{center:04d}_win{window:04d}_"
367
+ f"fps{fps:04d}_{mode}.mp4"
368
+ )
369
+ output_deblurred_dir = os.path.join(self.output_dir, "deblurred")
370
+ full_output_path = os.path.join(output_deblurred_dir, output_name)
371
+
372
+ # Keep only if output doesn't exist
373
+ if not os.path.exists(full_output_path):
374
+ cleaned_intervals.append(interval)
375
+ print("Len of test intervals after cleaning: ", len(cleaned_intervals))
376
+ print("Len of test intervals before cleaning: ", len(self.test_intervals))
377
+ self.test_intervals = cleaned_intervals
378
+
379
+
380
+ # TRAIN/VAL: build seq_dirs from each category's list or fallback
381
+ list_file = 'train_list.txt' if self.split == 'train' else 'test_list.txt'
382
+ for category in sorted(os.listdir(self.data_dir)):
383
+ cat_dir = os.path.join(self.data_dir, category)
384
+ if not os.path.isdir(cat_dir):
385
+ continue
386
+ list_path = os.path.join(cat_dir, list_file)
387
+ if os.path.isfile(list_path):
388
+ with open(list_path, 'r') as f:
389
+ for line in f:
390
+ rel = line.strip()
391
+ if not rel:
392
+ continue
393
+ seq_dir = os.path.join(self.data_dir, rel)
394
+ if os.path.isdir(seq_dir):
395
+ self.seq_dirs.append(seq_dir)
396
+ else:
397
+ fps_root = os.path.join(cat_dir, 'lower_fps_frames')
398
+ if os.path.isdir(fps_root):
399
+ for seq in sorted(os.listdir(fps_root)):
400
+ seq_path = os.path.join(fps_root, seq)
401
+ if os.path.isdir(seq_path):
402
+ self.seq_dirs.append(seq_path)
403
+
404
+ if self.split == 'val':
405
+ self.seq_dirs = self.seq_dirs[:5]
406
+ if self.split == 'train':
407
+ self.seq_dirs *= 10
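+ # duplicate the sequence list so a single epoch draws more samples; each __getitem__ still picks a random mode and stride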
408
+
409
+ assert self.seq_dirs, \
410
+ f"No sequences found for split '{self.split}' in {self.data_dir}"
411
+
412
+ def __len__(self):
413
+ return len(self.test_intervals) if self.split == 'test' else len(self.seq_dirs)
414
+
415
+ def __getitem__(self, idx):
416
+ # Prepare base items
417
+ if self.split == 'test':
418
+ interval = self.test_intervals[idx]
419
+ category, seq = interval['video_name'].split('/')
420
+ seq_dir = os.path.join(self.data_dir, category, 'lower_fps_frames', seq)
421
+ frame_paths = sorted(glob.glob(os.path.join(seq_dir, '*.png')))
422
+
423
+ in_start = interval['in_start']
424
+ in_end = interval['in_end']
425
+ out_start = interval['out_start']
426
+ out_end = interval['out_end']
427
+ center = interval['center']
428
+ window = interval['window_size']
429
+ mode = interval['mode']
430
+ fps = interval['fps']
431
+
432
+ # generate test case
433
+ blur_img, seq_frames, inp_int, out_int, high_fps_video, num_frames = generate_test_case(
434
+ frame_paths=frame_paths, window_max=window, in_start=in_start, in_end=in_end, out_start=out_start,out_end=out_end, center=center, mode=mode, fps=fps
435
+ )
436
+ file_name = frame_paths[center]
437
+
438
+ else:
439
+ seq_dir = self.seq_dirs[idx]
440
+ frame_paths = sorted(glob.glob(os.path.join(seq_dir, '*.png')))
441
+ mode = random.choice(['1x', '2x', 'large_blur'])
442
+
443
+ if mode == '1x' or len(frame_paths) < 50:
444
+ base_rate = random.choice([1, 2])
445
+ blur_img, seq_frames, inp_int, out_int, _ = generate_1x_sequence(
446
+ frame_paths, window_max=16, output_len=17, base_rate=base_rate
447
+ )
448
+ elif mode == '2x':
449
+ base_rate = random.choice([1, 2])
450
+ blur_img, seq_frames, inp_int, out_int, _ = generate_2x_sequence(
451
+ frame_paths, window_max=16, output_len=17, base_rate=base_rate
452
+ )
453
+ else:
454
+ max_base = min((len(frame_paths) - 1) // 17, 3)
455
+ base_rate = random.randint(1, max_base)
456
+ blur_img, seq_frames, inp_int, out_int, _ = generate_large_blur_sequence(
457
+ frame_paths, window_max=16, output_len=17, base_rate=base_rate
458
+ )
459
+ file_name = frame_paths[0]
460
+ num_frames = 16
461
+
462
+ # blur_img is a single frame; wrap in batch dim
463
+ blur_input = self.load_frames(np.expand_dims(blur_img, 0))
464
+ # seq_frames is list of frames; stack along time dim
465
+ video = self.load_frames(np.stack(seq_frames, axis=0))
466
+
467
+
468
+ relative_file_name = os.path.relpath(file_name, self.data_dir)
469
+
470
+ if self.split == 'test':
471
+ # Get base directory and frame prefix
472
+ base_dir = os.path.dirname(relative_file_name)
473
+ frame_stem = os.path.splitext(os.path.basename(relative_file_name))[0] # "frame_00000"
474
+
475
+ # Build new filename
476
+ new_filename = (
477
+ f"{frame_stem}_"
478
+ f"in{in_start:04d}_ie{in_end:04d}_"
479
+ f"os{out_start:04d}_oe{out_end:04d}_"
480
+ f"ctr{center:04d}_win{window:04d}_"
481
+ f"fps{fps:04d}_{mode}.png"
482
+ )
483
+
484
+ # Final path
485
+ relative_file_name = os.path.join(base_dir, new_filename)
486
+
487
+ data = {
488
+ 'file_name': relative_file_name,
489
+ 'blur_img': blur_input,
490
+ 'num_frames': num_frames,
491
+ 'video': video,
492
+ 'caption': "",
493
+ 'mode': mode,
494
+ 'input_interval': inp_int,
495
+ 'output_interval': out_int,
496
+ }
497
+ if self.split == 'test':
498
+ high_fps_video = self.load_frames(np.stack(high_fps_video, axis=0))
499
+ data['high_fps_video'] = high_fps_video
500
+ return data
501
+
502
+ class GoPro2xMotionBlurDataset(BaseClass):
503
+ def __init__(self,
504
+ *args, **kwargs):
505
+ super().__init__(*args, **kwargs)
506
+ # Set blur and sharp directories based on split
507
+ if self.split == 'train':
508
+ self.blur_root = os.path.join(self.data_dir, 'train', 'blur')
509
+ self.sharp_root = os.path.join(self.data_dir, 'train', 'sharp')
510
+ elif self.split in ['val', 'test']:
511
+ self.blur_root = os.path.join(self.data_dir, 'test', 'blur')
512
+ self.sharp_root = os.path.join(self.data_dir, 'test', 'sharp')
513
+ else:
514
+ raise ValueError(f"Unsupported split: {self.split}")
515
+
516
+ # Collect all blurred image paths
517
+ pattern = os.path.join(self.blur_root, '*', '*.png')
518
+
519
+ def get_sharp_paths(blur_paths):
520
+ sharp_paths = []
521
+ for blur_path in blur_paths:
522
+ base_dir = blur_path.replace('/blur/', '/sharp/')
523
+ frame_num = int(os.path.basename(blur_path).split('.')[0])
524
+ dir_path = os.path.dirname(base_dir)
525
+ sequence = [
526
+ os.path.join(dir_path, f"{frame_num + offset:06d}.png")
527
+ for offset in range(-6, 7)
528
+ ]
529
+ if all(os.path.exists(path) for path in sequence):
530
+ sharp_paths.append(sequence)
531
+ return sharp_paths
532
+
533
+
534
+
535
+
536
+ self.blur_paths = sorted(glob.glob(pattern))
537
+ filtered_blur_paths = []
538
+ for path in self.blur_paths:
539
+ output_deblurred_dir = os.path.join(self.output_dir, "deblurred")
540
+ full_output_path = Path(output_deblurred_dir, *path.split('/')[-2:]).with_suffix(".mp4")
541
+ if not os.path.exists(full_output_path):
542
+ filtered_blur_paths.append(path)
543
+ self.blur_paths = filtered_blur_paths
544
+
545
+ self.sharp_paths = get_sharp_paths(self.blur_paths)
546
+ if self.split == 'val':
547
+ # Optional: limit validation subset
548
+ self.sharp_paths = self.sharp_paths[:5]
549
+ self.length = len(self.sharp_paths)
550
+
551
+ def __len__(self):
552
+ return self.length
553
+
554
+ def __getitem__(self, idx):
555
+ # Path to the blurred (center) frame
556
+ sharp_path = self.sharp_paths[idx]
557
+
558
+
559
+ # Load sharp frames
560
+ blur_img, seq_frames, inp_int, out_int, high_fps_video, num_frames = generate_test_case(
561
+ frame_paths=sharp_path, window_max=13, in_start=3, in_end=10, out_start=0,out_end=13, center=6, mode="2x", fps=240
562
+ )
563
+
564
+ # Convert to pixel values via BaseClass loader
565
+ video = self.load_frames(np.array(seq_frames)) # shape: (output_length, H, W, C)
566
+ blur_input = self.load_frames(np.expand_dims(np.array(blur_img), 0)) # shape: (1, H, W, C)
567
+ last_two_parts_of_path = os.path.join(*sharp_path[6].split(os.sep)[-2:])
568
+ #print(f"Time taken to load and process data: {end_time - start_time:.2f} seconds")
569
+ data = {
570
+ 'file_name': last_two_parts_of_path,
571
+ 'blur_img': blur_input,
572
+ 'video': video,
573
+ "caption": "",
574
+ 'input_interval': inp_int,
575
+ 'output_interval': out_int,
576
+ "num_frames": num_frames,
577
+ "mode": "2x",
578
+ }
579
+ return data
580
+
581
+ class BAISTDataset(BaseClass):
582
+ def __init__(self, *args, **kwargs):
583
+ super().__init__(*args, **kwargs)
584
+
585
+
586
+ test_folders = {
587
+ "gWA_sBM_c01_d26_mWA0_ch06_cropped_32X": None,
588
+ "gBR_sBM_c01_d05_mBR0_ch01_cropped_32X": None,
589
+ "gMH_sBM_c01_d22_mMH0_ch04_cropped_32X": None,
590
+ "gHO_sBM_c01_d20_mHO0_ch05_cropped_32X": None,
591
+ "gMH_sBM_c01_d22_mMH0_ch08_cropped_32X": None,
592
+ "gWA_sBM_c01_d26_mWA0_ch02_cropped_32X": None,
593
+ "gJS_sBM_c01_d02_mJS0_ch08_cropped_32X": None,
594
+ "gHO_sBM_c01_d20_mHO0_ch07_cropped_32X": None,
595
+ "gHO_sBM_c01_d20_mHO0_ch06_cropped_32X": None,
596
+ "gBR_sBM_c01_d05_mBR0_ch03_cropped_32X": None,
597
+ "gBR_sBM_c01_d05_mBR0_ch05_cropped_32X": None,
598
+ "gHO_sBM_c01_d20_mHO0_ch02_cropped_32X": None,
599
+ "gHO_sBM_c01_d20_mHO0_ch03_cropped_32X": None,
600
+ "gHO_sBM_c01_d20_mHO0_ch09_cropped_32X": None,
601
+ "gMH_sBM_c01_d22_mMH0_ch10_cropped_32X": None,
602
+ "gWA_sBM_c01_d26_mWA0_ch10_cropped_32X": None,
603
+ "gBR_sBM_c01_d05_mBR0_ch06_cropped_32X": None,
604
+ "gHO_sBM_c01_d20_mHO0_ch08_cropped_32X": None,
605
+ "gMH_sBM_c01_d22_mMH0_ch06_cropped_32X": None,
606
+ "gHO_sBM_c01_d20_mHO0_ch10_cropped_32X": None,
607
+ "gMH_sBM_c01_d22_mMH0_ch09_cropped_32X": None,
608
+ "gMH_sBM_c01_d22_mMH0_ch02_cropped_32X": None,
609
+ "gBR_sBM_c01_d05_mBR0_ch04_cropped_32X": None,
610
+ "gPO_sBM_c01_d10_mPO0_ch09_cropped_32X": None,
611
+ "gMH_sBM_c01_d22_mMH0_ch01_cropped_32X": None,
612
+ "gMH_sBM_c01_d22_mMH0_ch07_cropped_32X": None,
613
+ "gMH_sBM_c01_d22_mMH0_ch03_cropped_32X": None,
614
+ "gHO_sBM_c01_d20_mHO0_ch04_cropped_32X": None,
615
+ "gBR_sBM_c01_d05_mBR0_ch02_cropped_32X": None,
616
+ "gHO_sBM_c01_d20_mHO0_ch01_cropped_32X": None,
617
+ "gMH_sBM_c01_d22_mMH0_ch05_cropped_32X": None,
618
+ "gPO_sBM_c01_d10_mPO0_ch10_cropped_32X": None,
619
+ }
620
+
621
+ def collect_blur_images(root_dir, allowed_folders, skip_start=40, skip_end=40):
622
+ blur_image_paths = []
623
+
624
+ for dirpath, dirnames, filenames in os.walk(root_dir):
625
+ if os.path.basename(dirpath) == "blur":
626
+ parent_folder = os.path.basename(os.path.dirname(dirpath))
627
+ if (self.split in ["test", "val"] and parent_folder in test_folders) or (self.split == "train" and parent_folder not in test_folders):
628
+ # Filter and sort valid image filenames
629
+ valid_files = [
630
+ f for f in filenames
631
+ if f.lower().endswith(('.png', '.jpg', '.jpeg', '.bmp', '.tiff')) and os.path.splitext(f)[0].isdigit()
632
+ ]
633
+ valid_files.sort(key=lambda x: int(os.path.splitext(x)[0]))
634
+
635
+ # Skip first and last N files
636
+ middle_files = valid_files[skip_start:len(valid_files) - skip_end]
637
+
638
+ for f in middle_files:
639
+ from pathlib import Path
640
+ full_path = Path(os.path.join(dirpath, f))
641
+ output_deblurred_dir = os.path.join(self.output_dir, "deblurred")
642
+ full_output_path = Path(output_deblurred_dir, *full_path.parts[-3:]).with_suffix(".mp4")
643
+ if not os.path.exists(full_output_path) or self.split in ["train", "val"]:
644
+ blur_image_paths.append(os.path.join(dirpath, f))
645
+
646
+ return blur_image_paths
647
+
648
+
649
+
650
+ self.image_paths = collect_blur_images(self.data_dir, test_folders)
651
+ # drop image paths that have no matching bounding-box annotation (.pkl)
652
+ self.image_paths = [path for path in self.image_paths if os.path.exists(path.replace("blur", "blur_anno").replace(".png", ".pkl"))]
653
+
654
+ filtered_image_paths = []
655
+ for blur_path in self.image_paths:
656
+ base_dir = blur_path.replace('/blur/', '/sharp/').replace('.png', '')
657
+ sharp_paths = [f"{base_dir}_{i:03d}.png" for i in range(7)]
658
+ if all(os.path.exists(p) for p in sharp_paths):
659
+ filtered_image_paths.append(blur_path)
660
+
661
+ self.image_paths = filtered_image_paths
662
+
663
+ if self.split == 'val':
664
+ # Optional: limit validation subset
665
+ self.image_paths = self.image_paths[:4]
666
+ self.length = len(self.image_paths)
667
+
668
+ def __len__(self):
669
+ return self.length
670
+
671
+
672
+ def __getitem__(self, idx):
673
+ image_path = self.image_paths[idx]
674
+ blur_img_original = load_as_srgb(image_path)
675
+
676
+ bbx_path = image_path.replace("blur", "blur_anno").replace(".png", ".pkl")
677
+
678
+ # load the bounding-box annotation
679
+ bbx = np.load(bbx_path, allow_pickle=True)['bbox'][0:4]
680
+ # Final crop box
681
+ # turn crop_box into a tuple
682
+ W,H = blur_img_original.size
683
+ blur_img = blur_img_original.resize((self.image_size[1], self.image_size[0]), resample=Image.BILINEAR)
684
+
685
+ # note: PIL image size is (width, height)
686
+ blur_np = np.array([blur_img])
687
+
688
+ base_dir = os.path.dirname(os.path.dirname(image_path)) # strip /blur
689
+ filename = os.path.splitext(os.path.basename(image_path))[0] # '00000000'
690
+ sharp_dir = os.path.join(base_dir, "sharp")
691
+
692
+ frame_paths = [
693
+ os.path.join(sharp_dir, f"{filename}_{i:03d}.png")
694
+ for i in range(7)
695
+ ]
696
+
697
+ _, seq_frames, inp_int, out_int, high_fps_video, num_frames = generate_test_case(
698
+ frame_paths=frame_paths, window_max=7, in_start=0, in_end=7, out_start=0,out_end=7, center=3, mode="1x", fps=240
699
+ )
700
+
701
+ pixel_values = self.load_frames(np.stack(seq_frames, axis=0))
702
+ blur_pixel_values = self.load_frames(blur_np)
703
+
704
+ relative_file_name = os.path.relpath(image_path, self.data_dir)
705
+
706
+ out_bbx = bbx.copy()
707
+
708
+ scale_x = blur_pixel_values.shape[3]/W
709
+ scale_y = blur_pixel_values.shape[2]/H
710
+ #scale the bbx
711
+ out_bbx[0] = int(out_bbx[0] * scale_x)
712
+ out_bbx[1] = int(out_bbx[1] * scale_y)
713
+ out_bbx[2] = int(out_bbx[2] * scale_x)
714
+ out_bbx[3] = int(out_bbx[3] * scale_y)
715
+
716
+ out_bbx = torch.tensor(out_bbx, dtype=torch.uint32)
717
+
718
+ #crop image using the bbx
719
+ blur_img_npy = np.array(blur_img)
720
+ out_bbx_npy = out_bbx.numpy().astype(np.uint32)
721
+ blur_img_npy = blur_img_npy[out_bbx_npy[1]:out_bbx_npy[3], out_bbx_npy[0]:out_bbx_npy[2], :]
722
+
723
+ data = {
724
+ 'file_name': relative_file_name,
725
+ 'blur_img': blur_pixel_values,
726
+ 'video': pixel_values,
727
+ 'bbx': out_bbx,
728
+ 'caption': "",
729
+ 'input_interval': inp_int,
730
+ 'output_interval': out_int,
731
+ "num_frames": num_frames,
732
+ 'mode': "1x",
733
+ }
734
+ return data
735
+
training/helpers.py ADDED
@@ -0,0 +1,533 @@
1
+ import torch
2
+ import math
3
+ import random
4
+ import numpy as np
5
+ from PIL import Image
6
+
7
+ def random_insert_latent_frame(
8
+ image_latent: torch.Tensor,
9
+ noisy_model_input: torch.Tensor,
10
+ target_latents: torch.Tensor,
11
+ input_intervals: torch.Tensor,
12
+ output_intervals: torch.Tensor,
13
+ special_info
14
+ ):
15
+ """
16
+ Inserts latent frames into noisy input, pads targets, and builds flattened intervals with flags.
17
+
18
+ Args:
19
+ image_latent: [B, latent_count, C, H, W]
20
+ noisy_model_input:[B, F, C, H, W]
21
+ target_latents: [B, F, C, H, W]
22
+ input_intervals: [B, N, frames_per_latent, L]
23
+ output_intervals: [B, M, frames_per_latent, L]
24
+
25
+ For each sample randomly choose:
26
+ Mode A (50%):
27
+ - Insert two image_latent frames at start of noisy input and targets.
28
+ - Pad target_latents by prepending two sentinel frames (large values that are masked out of the loss).
29
+ - Pad input_intervals by repeating its last group once.
30
+ Mode B (50%):
31
+ - Insert one image_latent frame at start and repeat last noisy frame at end.
32
+ - Pad target_latents by prepending one zero frame and appending the last target frame.
33
+ - Pad output_intervals by repeating its last group once.
34
+
35
+ After padding intervals, flatten each group from [frames_per_latent, L] to [frames_per_latent * L].
36
+ (The optional 4-element input/output flag append is currently disabled in this implementation.)
37
+
38
+ Returns:
39
+ outputs: Tensor [B, F', C, H, W], where F' = F+1 if special_info == "just_one" else F+2
40
+ new_targets: Tensor [B, F', C, H, W]
41
+ masks: Tensor [B, F'] bool mask of inserted latent frames
42
+ intervals: Tensor [B, N+M, fpl * L]
43
+ """
44
+ B, F, C, H, W = noisy_model_input.shape
45
+ _, N, fpl, L = input_intervals.shape
46
+ _, M, _, _ = output_intervals.shape
47
+ device = noisy_model_input.device
48
+
49
+ new_F = F + 1 if special_info == "just_one" else F + 2
50
+ outputs = torch.empty((B, new_F, C, H, W), device=device)
51
+ masks = torch.zeros((B, new_F), dtype=torch.bool, device=device)
52
+ combined_groups = N + M #+ 1
53
+ feature_len = fpl * L
54
+ # intervals = torch.empty((B, combined_groups, feature_len + 4), device=device,
55
+ # dtype=input_intervals.dtype)
56
+ intervals = torch.empty((B, combined_groups, feature_len), device=device,
57
+ dtype=input_intervals.dtype)
58
+ new_targets = torch.empty((B, new_F, C, H, W), device=device,
59
+ dtype=target_latents.dtype)
60
+
61
+ for b in range(B):
62
+ latent = image_latent[b, 0]
63
+ frames = noisy_model_input[b]
64
+ tgt = target_latents[b]
65
+
66
+ limit = 10 if special_info == "use_a" else 0.5
67
+ if special_info == "just_one": #ALWAYS_MODE_A
68
+ # "just_one": a single latent insert, sentinel-prefixed target
69
+ outputs[b, 0] = latent
70
+ masks[b, :1] = True
71
+ outputs[b, 1:] = frames
72
+
73
+ # pad target: one sentinel frame (a large value) that is ignored in the loss
74
+ large_number = torch.ones_like(tgt[0])*10000
75
+ new_targets[b, 0] = large_number
76
+ new_targets[b, 1:] = tgt
77
+
78
+ # pad intervals: input + replicated last input group
79
+ #pad_group = input_intervals[b, -1:].clone()
80
+ in_groups = input_intervals[b] #torch.cat([input_intervals[b], pad_group], dim=0)
81
+ out_groups = output_intervals[b]
82
+ elif random.random() < limit: #ALWAYS_MODE_A
83
+ # Mode A: two latent inserts, sentinel-prefixed targets
84
+ outputs[b, 0] = latent
85
+ outputs[b, 1] = latent
86
+ masks[b, :2] = True
87
+ outputs[b, 2:] = frames
88
+
89
+ # pad targets: two sentinel frames (large values) that are ignored in the loss
90
+ large_number = torch.ones_like(tgt[0])*10000
91
+ new_targets[b, 0] = large_number
92
+ new_targets[b, 1] = large_number
93
+ new_targets[b, 2:] = tgt
94
+
95
+ # pad intervals: input + replicated last input group
96
+ pad_group = input_intervals[b, -1:].clone()
97
+ in_groups = torch.cat([input_intervals[b], pad_group], dim=0)
98
+ out_groups = output_intervals[b]
99
+ else:
100
+ # Mode B: one latent insert & last-frame repeat, zero-prefixed / last-appended targets
101
+ outputs[b, 0] = latent
102
+ masks[b, 0] = True
103
+ outputs[b, 1:new_F-1] = frames
104
+ outputs[b, new_F-1] = frames[-1]
105
+
106
+ # pad targets: one zero frame, then the originals, then the last frame repeated
107
+ zero = torch.zeros_like(tgt[0])
108
+ new_targets[b, 0] = zero
109
+ new_targets[b, 1:new_F-1] = tgt
110
+ new_targets[b, new_F-1] = tgt[-1]
111
+
112
+ # pad intervals: output + replicated last output group
113
+ in_groups = input_intervals[b]
114
+ pad_group = output_intervals[b, -1:].clone()
115
+ out_groups = torch.cat([output_intervals[b], pad_group], dim=0)
116
+
117
+ # flatten & flag groups
118
+ flat_in = in_groups.reshape(-1, feature_len)
119
+ proc_in = torch.cat([flat_in], dim=1)
120
+
121
+ flat_out = out_groups.reshape(-1, feature_len)
122
+ proc_out = torch.cat([flat_out], dim=1)
123
+
124
+ intervals[b] = torch.cat([proc_in, proc_out], dim=0)
125
+
126
+ return outputs, new_targets, masks, intervals
127
+
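+ # Illustrative shape sketch (not part of the training code): with
+ # special_info == "just_one" and noisy_model_input of shape [B, F, C, H, W],
+ # a single conditioning latent is prepended, so
+ #   outputs.shape     == (B, F + 1, C, H, W)
+ #   masks[:, 0]       is True (the inserted latent frame)
+ #   new_targets[:, 0] is a sentinel (1e4) excluded from the loss via the mask
+ #   intervals.shape   == (B, N + M, frames_per_latent * L)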
128
+
129
+
130
+
131
+ def transform_intervals(
132
+ intervals: torch.Tensor,
133
+ frames_per_latent: int = 4,
134
+ repeat_first: bool = True
135
+ ) -> torch.Tensor:
136
+ """
137
+ Pad and reshape intervals into [B, num_latent_frames, frames_per_latent, L].
138
+
139
+ Args:
140
+ intervals: Tensor of shape [B, N, L]
141
+ frames_per_latent: number of frames per latent group (e.g., 4)
142
+ repeat_first: if True, pad at the beginning by repeating the first row; otherwise pad at the end by repeating the last row.
143
+
144
+ Returns:
145
+ Tensor of shape [B, num_latent_frames, frames_per_latent, L]
146
+ """
147
+ B, N, L = intervals.shape
148
+ num_latent = math.ceil(N / frames_per_latent)
149
+ target_N = num_latent * frames_per_latent
150
+ pad_count = target_N - N
151
+
152
+ if pad_count > 0:
153
+ # choose row to repeat
154
+ pad_row = intervals[:, :1, :] if repeat_first else intervals[:, -1:, :]
155
+ # replicate pad_row pad_count times
156
+ pad = pad_row.repeat(1, pad_count, 1)
157
+ # pad at beginning or end
158
+ if repeat_first:
159
+ expanded = torch.cat([pad, intervals], dim=1)
160
+ else:
161
+ expanded = torch.cat([intervals, pad], dim=1)
162
+ else:
163
+ expanded = intervals[:, :target_N, :]
164
+
165
+ # reshape into latent-frame groups
166
+ return expanded.view(B, num_latent, frames_per_latent, L)
167
+
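+ # Illustrative example (a sketch, not part of the training code):
+ #   ivals = torch.zeros(2, 17, 2)                    # [B=2, N=17, L=2]
+ #   out = transform_intervals(ivals, frames_per_latent=4)
+ #   out.shape  # torch.Size([2, 5, 4, 2]); 3 pad rows are prepended because
+ #              # ceil(17 / 4) * 4 - 17 == 3 and repeat_first defaults to True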
178
+
179
+
180
+ def build_blur(frame_paths, gamma=2.2):
181
+ """
182
+ Simulate motion blur using inverse-gamma (linear-light) summation:
183
+ - Load each image, convert to float32 sRGB [0,255]
184
+ - Linearize via inverse gamma: linear = (img/255)^gamma
185
+ - Sum linear values, average, then re-encode via gamma: (linear_avg)^(1/gamma)*255
186
+ Returns a uint8 numpy array.
187
+ """
188
+ acc_lin = None
189
+ for p in frame_paths:
190
+ img = np.array(Image.open(p).convert('RGB'), dtype=np.float32)
191
+ # normalize to [0,1] then linearize
192
+ lin = np.power(img / 255.0, gamma)
193
+ acc_lin = lin if acc_lin is None else acc_lin + lin
194
+ # average in linear domain
195
+ avg_lin = acc_lin / len(frame_paths)
196
+ # gamma-encode back to sRGB domain
197
+ srgb = np.power(avg_lin, 1.0 / gamma) * 255.0
198
+ return np.clip(srgb, 0, 255).astype(np.uint8)
199
+
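+ # Illustrative example (a sketch): averaging a black and a white frame in
+ # linear light and re-encoding gives roughly 186/255, not 128/255, since
+ #   (0.5) ** (1 / 2.2) * 255 ≈ 186
+ # which is why the blur is accumulated in linear light rather than on raw sRGB.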
200
+ def generate_1x_sequence(frame_paths, window_max=16, output_len=17, base_rate=1, start=None):
201
+ """
202
+ 1× mode at arbitrary base_rate (units of 1/240s):
203
+ - Treat each output step as the sum of `base_rate` consecutive raw frames.
204
+ - Pick window size W ∈ [1, output_len]
205
+ - Randomly choose start index so W*base_rate frames fit
206
+ - Group raw frames into W groups of length base_rate
207
+ - Build blur image over all W*base_rate frames for input
208
+ - For each group, build a blurred output frame by summing its base_rate frames
209
+ - Pad sequence of W blurred frames to output_len by repeating last blurred frame
210
+ - Input interval always [-0.5, 0.5]
211
+ - Output intervals reflect each group’s coverage within [-0.5,0.5]
212
+ """
213
+ N = len(frame_paths)
214
+ max_w = min(output_len, N // base_rate)
215
+ max_w = min(max_w, window_max)
216
+ W = random.randint(1, max_w)
217
+ if start is not None:
218
+ # a start index was supplied; just check that W*base_rate frames fit
219
+ assert N >= W * base_rate, f"Not enough frames for base_rate={base_rate}, need {W * base_rate}, got {N}"
220
+ else:
221
+ start = random.randint(0, N - W * base_rate)
222
+
223
+
224
+ # group start indices
225
+ group_starts = [start + i * base_rate for i in range(W)]
226
+ # flatten raw frame paths for blur input
227
+ blur_paths = []
228
+ for gs in group_starts:
229
+ blur_paths.extend(frame_paths[gs:gs + base_rate])
230
+ blur_img = build_blur(blur_paths)
231
+
232
+ # build blurred output frames per group
233
+ seq = []
234
+ for gs in group_starts:
235
+ group = frame_paths[gs:gs + base_rate]
236
+ seq.append(build_blur(group))
237
+ # pad with last blurred frame
238
+ seq += [seq[-1]] * (output_len - len(seq))
239
+
240
+ input_interval = torch.tensor([[-0.5, 0.5]], dtype=torch.float)
241
+ # each group covers interval of length 1/W
242
+ step = 1.0 / W
243
+ intervals = [[-0.5 + i * step, -0.5 + (i + 1) * step] for i in range(W)]
244
+ num_frames = len(intervals)
245
+ intervals += [intervals[-1]] * (output_len - W)
246
+ output_intervals = torch.tensor(intervals, dtype=torch.float)
247
+
248
+ return blur_img, seq, input_interval, output_intervals, num_frames
249
+
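+ # Illustrative example (a sketch): with W = 4 groups the output intervals are
+ #   [-0.50, -0.25], [-0.25, 0.00], [0.00, 0.25], [0.25, 0.50]
+ # i.e. the blur window [-0.5, 0.5] is split into W equal contiguous slices,
+ # then padded to output_len by repeating the last slice.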
250
+ def generate_2x_sequence(frame_paths, window_max=16, output_len=17, base_rate=1):
251
+ """
252
+ 2× mode:
253
+ - Logical window of W output-steps so that 2*W ≤ output_len
254
+ - Raw window spans W*base_rate frames
255
+ - Build blur only over that raw window (flattened) for input
256
+ - before_count = W//2, after_count = W - before_count
257
+ - Define groups for before, during, and after each of length base_rate
258
+ - Build blurred frames for each group
259
+ - Pad sequence of 2*W blurred frames to output_len by repeating last
260
+ - Input interval always [-0.5,0.5]
261
+ - Output intervals relative to window: each group’s center
262
+ """
263
+ N = len(frame_paths)
264
+ max_w = min(output_len // 2, N // base_rate)
265
+ max_w = min(max_w, window_max)
266
+ W = random.randint(1, max_w)
267
+ before_count = W // 2
268
+ after_count = W - before_count
269
+ # choose start so that before and after stay within bounds
270
+ min_start = before_count * base_rate
271
+ max_start = N - (W + after_count) * base_rate
272
+ # ensure we can pick a valid start, else fail
273
+ assert max_start >= min_start, f"Cannot satisfy before/after window for W={W}, base_rate={base_rate}, N={N}"
274
+ start = random.randint(min_start, max_start)
275
+
276
+
277
+ # window group starts
278
+ window_starts = [start + i * base_rate for i in range(W)]
279
+ # flatten for blur input
280
+ blur_paths = []
281
+ for gs in window_starts:
282
+ blur_paths.extend(frame_paths[gs:gs + base_rate])
283
+
284
+
285
+ blur_img = build_blur(blur_paths)
286
+
287
+ # define before/after group starts
288
+ before_count = W // 2
289
+ after_count = W - before_count
290
+ before_starts = [max(0, start - (i + 1) * base_rate) for i in range(before_count)][::-1]
291
+ after_starts = [min(N - base_rate, start + W * base_rate + i * base_rate) for i in range(after_count)]
292
+
293
+ # all group starts in sequence
294
+ group_starts = before_starts + window_starts + after_starts
295
+ # build blurred frames per group
296
+ seq = []
297
+ for gs in group_starts:
298
+ group = frame_paths[gs:gs + base_rate]
299
+ seq.append(build_blur(group))
300
+ # pad blurred frames to output_len
301
+ seq += [seq[-1]] * (output_len - len(seq))
302
+
303
+ input_interval = torch.tensor([[-0.5, 0.5]], dtype=torch.float)
304
+ # each group covers 1/(2W) around its center within [-0.5,0.5]
305
+ half = 0.5 / W
306
+ centers = [((gs - start) / (W * base_rate)) - 0.5 + half
307
+ for gs in group_starts]
308
+ intervals = [[c - half, c + half] for c in centers]
309
+ num_frames = len(intervals)
310
+ intervals += [intervals[-1]] * (output_len - len(intervals))
311
+ output_intervals = torch.tensor(intervals, dtype=torch.float)
312
+
313
+ return blur_img, seq, input_interval, output_intervals, num_frames
314
+
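+ # Illustrative example (a sketch): with W = 2, base_rate = 1 and start = 10,
+ # window_starts = [10, 11], before_starts = [9], after_starts = [12], giving
+ #   [-1.0, -0.5], [-0.5, 0.0], [0.0, 0.5], [0.5, 1.0]
+ # so the outputs span twice the blur window [-0.5, 0.5] (hence "2x").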
315
+
316
+ def generate_large_blur_sequence(frame_paths, window_max=16, output_len=17, base_rate=1):
317
+ """
318
+ Large blur mode (fixed output_len=25) with instantaneous outputs:
319
+ - Raw window spans 25 * base_rate consecutive frames
320
+ - Build blur over that full raw window for input
321
+ - For output sequence:
322
+ • Pick 1 raw frame every `base_rate` (group_starts)
323
+ • Each output frame is the instantaneous frame at that raw index
324
+ - Input interval always [-0.5, 0.5]
325
+ - Output intervals reflect each 1-frame slice’s coverage within the blur window,
326
+ leaving gaps between.
327
+ """
328
+ N = len(frame_paths)
329
+ total_raw = window_max * base_rate
330
+ assert N >= total_raw, f"Not enough frames for base_rate={base_rate}, need {total_raw}, got {N}"
331
+ start = random.randint(0, N - total_raw)
332
+
333
+ # build blur input over the full raw block
334
+ raw_block = frame_paths[start:start + total_raw]
335
+ blur_img = build_blur(raw_block)
336
+
337
+ # output sequence: instantaneous frames at each group_start
338
+ seq = []
339
+ group_starts = [start + i * base_rate for i in range(window_max)]
340
+ for gs in group_starts:
341
+ img = np.array(Image.open(frame_paths[gs]).convert('RGB'), dtype=np.uint8)
342
+ seq.append(img)
343
+ # pad blurred frames to output_len
344
+ seq += [seq[-1]] * (output_len - len(seq))
345
+
346
+ # compute intervals for each instantaneous frame:
347
+ # each covers [gs, gs+1) over total_raw, normalized to [-0.5, 0.5]
348
+ intervals = []
349
+ for gs in group_starts:
350
+ t0 = (gs - start) / total_raw - 0.5
351
+ t1 = (gs + 1 - start) / total_raw - 0.5
352
+ intervals.append([t0, t1])
353
+ num_frames = len(intervals)
354
+ intervals += [intervals[-1]] * (output_len - len(intervals))
355
+ output_intervals = torch.tensor(intervals, dtype=torch.float)
356
+
357
+ # input interval
358
+ input_interval = torch.tensor([[-0.5, 0.5]], dtype=torch.float)
359
+ return blur_img, seq, input_interval, output_intervals, num_frames
360
+
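+ # Illustrative example (a sketch): with window_max = 16 and base_rate = 2 the
+ # blur spans 32 raw frames while each output is a single raw frame, so the
+ # first two intervals are [-0.5, -0.46875] and [-0.4375, -0.40625]: narrow
+ # slices with gaps between them, unlike the contiguous 1x/2x intervals.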
361
+ def generate_test_case(frame_paths,
362
+ window_max=16,
363
+ output_len=17,
364
+ in_start=None,
365
+ in_end=None,
366
+ out_start=None,
367
+ out_end=None,
368
+ center=None,
369
+ mode="1x",
370
+ fps=240):
371
+ """
372
+ Generate blurred input + a target sequence + normalized intervals.
373
+
374
+ Args:
375
+ frame_paths: list of all frame filepaths
376
+ window_max: number of groups/bins W
377
+ output_len: desired length of the output sequence
378
+ in_start, in_end: integer indices defining the raw blur window [in_start, in_end)
379
+ out_start, out_end: integer indices defining the output frame range [out_start, out_end)
+ mode: one of "1x", "2x", or "lb"
380
+ fps: output frame rate; base_rate = 240 // fps is the raw-frame stride per output step
381
+
382
+ Returns:
383
+ blur_img: np.ndarray of the global blur over the input window
384
+ summed_seq: list of np.ndarray, length = output_len (blurred per-group frames)
385
+ input_interval: torch.Tensor [[-0.5, 0.5]]
386
+ output_intervals: torch.Tensor shape [output_len, 2], normalized in [-0.5, 0.5]
+ seq: list of np.ndarray raw frames covering [out_start, out_end)
+ num_frames: number of valid (non-padded) output intervals
387
+ """
388
+ # 1) slice and blur
389
+ raw_paths = frame_paths[in_start:in_end]
390
+
391
+ blur_img = build_blur(raw_paths)
392
+
393
+ # 2) build the sequence
394
+ # one target per frame
395
+ seq = [
396
+ np.array(Image.open(p).convert("RGB"), dtype=np.uint8)
397
+ for p in frame_paths[out_start:out_end]
398
+ ]
399
+
400
+ # 3) compute normalized intervals
401
+ input_interval = torch.tensor([[-0.5, 0.5]], dtype=torch.float)
402
+
403
408
+
409
+ # 3) define the raw intervals in absolute frame‐indices
410
+ base_rate = 240 // fps
411
+ if mode == "1x":
412
+ assert in_start == out_start and in_end == out_end
413
+ #assert fps == 240, "haven't implemented 120fps in 1x yet"
414
+ W = (out_end - out_start) // base_rate
415
+ # one frame per window
416
+ group_starts = [out_start + i * base_rate for i in range(W)]
417
+ group_ends = [out_start + (i + 1) * base_rate for i in range(W)]
418
+
419
+ elif mode == "2x":
420
+ W = (out_end - out_start) // base_rate
421
+ # every base_rate frames, starting at out_start
422
+ group_starts = [out_start + i * base_rate for i in range(W)]
423
+ group_ends = [out_start + (i + 1) * base_rate for i in range(W)]
424
+
425
+ elif mode == "lb":
426
+ W = (out_end - out_start) // base_rate
427
+ # sparse “key‐frame” windows from the raw input range
428
+ group_starts = [in_start + i * base_rate for i in range(W)]
429
+ group_ends = [s + 1 for s in group_starts]
430
+
431
+ else:
432
+ raise ValueError(f"Unsupported mode: {mode}")
433
+
434
+ # --- after mode‐switch, once you have raw group_starts & group_ends ---
435
+ # 4) build a summed video sequence by blurring each interval
436
+ summed_seq = []
437
+ for s, e in zip(group_starts, group_ends):
438
+ # make sure indices lie in [in_start, in_end)
439
+ s_clamped = max(in_start, min(s, in_end-1))
440
+ e_clamped = max(s_clamped+1, min(e, in_end))
441
+ # sum/blur the frames in [s_clamped:e_clamped)
442
+ summed = build_blur(frame_paths[s_clamped:e_clamped])
443
+ summed_seq.append(summed)
444
+
445
+ # pad to output_len
446
+ if len(summed_seq) < output_len:
447
+ summed_seq += [summed_seq[-1]] * (output_len - len(summed_seq))
448
+
449
+ # 5) normalize the group boundaries to the blur window [-0.5, 0.5]
450
+ def normalize(x):
451
+ return (x - in_start) / (in_end - in_start) - 0.5
452
+
453
+ intervals = [[normalize(s), normalize(e)] for s, e in zip(group_starts, group_ends)]
454
+ num_frames = len(intervals)
455
+ if len(intervals) < output_len:
456
+ intervals += [intervals[-1]] * (output_len - len(intervals))
457
+
458
+ output_intervals = torch.tensor(intervals, dtype=torch.float)
459
+
460
+ # final return now also includes summed_seq
461
+ return blur_img, summed_seq, input_interval, output_intervals, seq, num_frames
462
+
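+ # Illustrative example (a sketch): the BAIST dataset above calls this with
+ # window_max=7, in_start=0, in_end=7, out_start=0, out_end=7, mode="1x", fps=240,
+ # so base_rate = 1, W = 7 and the output intervals are [i/7 - 0.5, (i+1)/7 - 0.5]
+ # for i = 0..6, padded to output_len = 17 by repeating the last interval.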
463
+
464
+ def get_conditioning(
465
+ output_len=17,
466
+ in_start=None,
467
+ in_end=None,
468
+ out_start=None,
469
+ out_end=None,
470
+ mode="1x",
471
+ fps=240,
472
+ ):
473
+ """
474
+ Generate normalized interval conditioning signals. Just like the above function, but without
475
+ loading any images (for inference only).
476
+
477
+ Args:
478
+ output_len: desired length of the output sequence
479
+ in_start, in_end: integer indices defining the raw blur window [in_start, in_end)
480
+ out_start, out_end: integer indices defining the output frame range [out_start, out_end)
+ mode: one of "1x", "2x", or "lb"
481
+ fps: output frame rate; base_rate = 240 // fps is the raw-frame stride per output step
482
+
483
+ Returns:
484
+ input_interval: torch.Tensor [[-0.5, 0.5]]
485
+ output_intervals: torch.Tensor shape [output_len, 2], normalized in [-0.5,0.5]
486
+ """
487
+
488
+ # 3) compute normalized intervals
489
+ input_interval = torch.tensor([[-0.5, 0.5]], dtype=torch.float)
490
+
491
496
+
497
+ # 3) define the raw intervals in absolute frame‐indices
498
+ base_rate = 240 // fps
499
+ if mode == "1x":
500
+ assert in_start == out_start and in_end == out_end
501
+ #assert fps == 240, "haven't implemented 120fps in 1x yet"
502
+ W = (out_end - out_start) // base_rate
503
+ # one frame per window
504
+ group_starts = [out_start + i * base_rate for i in range(W)]
505
+ group_ends = [out_start + (i + 1) * base_rate for i in range(W)]
506
+
507
+ elif mode == "2x":
508
+ W = (out_end - out_start) // base_rate
509
+ # every base_rate frames, starting at out_start
510
+ group_starts = [out_start + i * base_rate for i in range(W)]
511
+ group_ends = [out_start + (i + 1) * base_rate for i in range(W)]
512
+
513
+ elif mode == "lb":
514
+ W = (out_end - out_start) // base_rate
515
+ # sparse “key‐frame” windows from the raw input range
516
+ group_starts = [in_start + i * base_rate for i in range(W)]
517
+ group_ends = [s + 1 for s in group_starts]
518
+
519
+ else:
520
+ raise ValueError(f"Unsupported mode: {mode}")
521
+
522
+ # 5) normalize the group boundaries to the blur window [-0.5, 0.5]
523
+ def normalize(x):
524
+ return (x - in_start) / (in_end - in_start) - 0.5
525
+
526
+ intervals = [[normalize(s), normalize(e)] for s, e in zip(group_starts, group_ends)]
527
+ num_frames = len(intervals)
528
+ if len(intervals) < output_len:
529
+ intervals += [intervals[-1]] * (output_len - len(intervals))
530
+
531
+ output_intervals = torch.tensor(intervals, dtype=torch.float)
532
+
533
+ return input_interval, output_intervals, num_frames
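+ # Illustrative example (a sketch, inference only), mirroring the GoPro 2x
+ # dataset call above:
+ #   inp, outp, n = get_conditioning(in_start=3, in_end=10, out_start=0,
+ #                                   out_end=13, mode="2x", fps=240)
+ # returns inp == [[-0.5, 0.5]], n == 13, and outp[0] ≈ [-0.93, -0.79].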
training/slurm_scripts/simple_multinode.sbatch ADDED
@@ -0,0 +1,88 @@
1
+ #!/bin/bash
2
+ #SBATCH --job-name=XYZ
3
+ #SBATCH --nodes=4
4
+ #SBATCH --mem=256gb
5
+ #SBATCH --ntasks-per-node=1 # crucial - only 1 task per dist per node!
6
+ #SBATCH --cpus-per-task=28
7
+ #SBATCH --gpus-per-node=4
8
+ #SBATCH --exclusive
9
+ #SBATCH --output=output/slurm-%j-%N.out
10
+ #SBATCH --error=error/slurm-%j-%N.err
11
+ #SBATCH --qos=scavenger
12
+ #SBATCH --signal=B:USR1@300
13
+ #SBATCH --nodelist=lse-hpcnode[1,3,4,5,10-12]
14
+
15
+ #6 and 9 are messed up
16
+ #7 is sketchy as well
17
+
18
+ set -x -e
19
+
20
+ if [ -z "$1" ]
21
+ then
22
+ #quit if no job number is passed
23
+ echo "No config file passed, quitting"
24
+ exit 1
25
+ else
26
+ config_file=$1
27
+ fi
28
+
29
+ source ~/.bashrc
30
+ conda activate gencam
31
+ cd /datasets/sai/gencam/cogvideox/training
32
+
33
+ echo "START TIME: $(date)"
34
+
35
+ # needed until we fix IB issues
36
+ export NCCL_IB_DISABLE=1
37
+ export NCCL_SOCKET_IFNAME=ens
38
+
39
+ # Training setup
40
+ GPUS_PER_NODE=4
41
+ # so processes know who to talk to
42
+ MASTER_ADDR=$(scontrol show hostnames $SLURM_JOB_NODELIST | head -n 1)
43
+ MASTER_PORT=6000
44
+ NNODES=$SLURM_NNODES
45
+ NODE_RANK=$SLURM_PROCID
46
+ WORLD_SIZE=$(($GPUS_PER_NODE*$NNODES))
47
+
48
+
49
+ #CMD="accelerate_test.py"
50
+ CMD="train_controlnet.py --config $config_file"
51
+ LAUNCHER="accelerate launch \
52
+ --multi_gpu \
53
+ --gpu_ids 0,1,2,3 \
54
+ --num_processes $WORLD_SIZE \
55
+ --num_machines $NNODES \
56
+ --main_process_ip $MASTER_ADDR \
57
+ --main_process_port $MASTER_PORT \
58
+ --rdzv_backend=c10d \
59
+ --max_restarts 0 \
60
+ --tee 3 \
61
+ "
62
+
63
+ # # NOT SURE THE FOLLOWING ENV VARS IS STRICTLY NEEDED (PROBABLY NOT)
64
+ # export CUDA_HOME=/usr/local/cuda-11.6
65
+ # export LD_PRELOAD=$CUDA_HOME/lib/libnccl.so
66
+ # export LD_LIBRARY_PATH=$CUDA_HOME/efa/lib:$CUDA_HOME/lib:$CUDA_HOME/lib64:$LD_LIBRARY_PATH
67
+
68
+ SRUN_ARGS=" \
69
+ --wait=60 \
70
+ --kill-on-bad-exit=1 \
71
+ "
72
+
73
+ handler()
74
+ {
75
+ echo "Signal handler triggered at $(date)"
76
+
77
+ sleep 120 # Let training save
78
+ sbatch ${BASH_SOURCE[0]} $config_file
79
+ }
80
+
81
+ # register signal handler
82
+ trap handler SIGUSR1
83
+
84
+ clear; srun --cpu-bind=none --jobid $SLURM_JOB_ID $LAUNCHER $CMD & srun_pid=$!
85
+
86
+ wait
87
+
88
+ echo "END TIME: $(date)"
training/slurm_scripts/slurm-bash.sh ADDED
@@ -0,0 +1 @@
1
+ srun --nodes=1 --gpus=4 --qos=gpu4-8h --pty bash
training/slurm_scripts/train.sbatch ADDED
@@ -0,0 +1,54 @@
1
+ #!/bin/bash
2
+ #SBATCH --job-name=train_deblur
3
+ #SBATCH --nodes=1
4
+ #SBATCH --gpus-per-node=4
5
+ #SBATCH --qos=gpu4-8h
6
+ #SBATCH --signal=B:USR1@600
7
+ #SBATCH --cpus-per-task=24
8
+ #SBATCH --output=output/slurm-%j.out
9
+ #SBATCH --error=error/slurm-%j.err
10
+ #SBATCH --nodelist=lse-hpcnode[8]
11
+
12
+ #the signal time needs to be larger than the sleep in the handler function
13
+
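+ # Usage (illustrative): sbatch train.sbatch <path/to/train_config.yaml>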
14
+ # prepare your environment here
15
+ source ~/.bashrc
16
+ conda activate gencam
17
+ cd /datasets/sai/gencam/cogvideox/training
18
+ export CUDA_VISIBLE_DEVICES=0,1,2,3
19
+ export PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True
20
+
21
+ if [ -z "$1" ]
22
+ then
23
+ # quit if no config file is passed
24
+ echo "No config file passed, quitting"
25
+ exit 1
26
+ else
27
+ config_file=$1
28
+ fi
29
+
30
+ handler()
31
+ {
32
+ echo "function handler called at $(date)"
33
+ # Send SIGUSR1 to the captured PID of the accelerate job
34
+ if [ -n "$accelerate_pid" ]; then
35
+ echo "Sending SIGUSR1 to accelerate PID: $accelerate_pid"
36
+ python_id=$(ps --ppid $accelerate_pid -o pid=)
37
+ kill -USR1 $python_id # Send SIGUSR1 to the accelerate job
38
+ sleep 300 # Wait for 5 minutes
39
+ else
40
+ echo "No accelerate PID found"
41
+ fi
42
+ echo "Resubmitting job with config file: $config_file"
43
+ sbatch ${BASH_SOURCE[0]} $config_file
44
+ }
45
+
46
+ # register signal handler
47
+ trap handler SIGUSR1
48
+
49
+ echo "Starting job at $(date)"
50
+ #python train_controlnet.py #--config $config_file #& wait
51
+ accelerate launch --config_file accelerator_configs/accelerator_train_config.yaml --multi_gpu train_controlnet.py --config $config_file &
52
+ accelerate_pid=$!
53
+
54
+ wait
training/slurm_scripts/val.sbatch ADDED
@@ -0,0 +1,50 @@
1
+ #!/bin/bash
2
+ #SBATCH --job-name=train_deblur
3
+ #SBATCH --nodes=1
4
+ #SBATCH --gpus-per-node=4
5
+ #SBATCH --qos=scavenger
6
+ #SBATCH --signal=B:USR1@600
7
+ #SBATCH --cpus-per-task=24
8
+ #SBATCH --output=output/slurm-%j.out
9
+ #SBATCH --error=error/slurm-%j.err
10
+ #SBATCH --exclude=lse-hpcnode9
11
+ # prepare your environment here
12
+ source ~/.bashrc
13
+ conda activate gencam
14
+ cd /datasets/sai/gencam/cogvideox/training
15
+ export CUDA_VISIBLE_DEVICES=0,1,2,3
16
+ export PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True
17
+
18
+ if [ -z "$1" ]
19
+ then
20
+ #quit if no job number is passed
21
+ echo "No config file passed, quitting"
22
+ exit 1
23
+ else
24
+ config_file=$1
25
+ fi
26
+
27
+ handler()
28
+ {
29
+ echo "function handler called at $(date)"
30
+ # Send SIGUSR1 to the captured PID of the accelerate job
31
+ if [ -n "$accelerate_pid" ]; then
32
+ echo "Sending SIGUSR1 to accelerate PID: $accelerate_pid"
33
+ python_id=$(ps --ppid $accelerate_pid -o pid=)
34
+ kill -USR1 $python_id # Send SIGUSR1 to the accelerate job
35
+ sleep 300 # Wait for 5 minutes
36
+ else
37
+ echo "No accelerate PID found"
38
+ fi
39
+ sbatch ${BASH_SOURCE[0]} $config_file
40
+ }
41
+
42
+ # register signal handler
43
+ trap handler SIGUSR1
44
+
45
+ echo "Starting job at $(date)"
46
+ #python train_controlnet.py #--config $config_file #& wait
47
+ accelerate launch --config_file accelerator_configs/accelerator_val_config.yaml --multi_gpu train_controlnet.py --config $config_file &
48
+ accelerate_pid=$!
49
+
50
+ wait
training/test_dataset.py ADDED
File without changes
training/train_controlnet.py ADDED
@@ -0,0 +1,724 @@
1
+ # Copyright 2024 The HuggingFace Team.
2
+ # All rights reserved.
3
+ #
4
+ # Licensed under the Apache License, Version 2.0 (the "License");
5
+ # you may not use this file except in compliance with the License.
6
+ # You may obtain a copy of the License at
7
+ #
8
+ # http://www.apache.org/licenses/LICENSE-2.0
9
+ #
10
+ # Unless required by applicable law or agreed to in writing, software
11
+ # distributed under the License is distributed on an "AS IS" BASIS,
12
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13
+ # See the License for the specific language governing permissions and
14
+ # limitations under the License.
15
+
16
+ import signal
17
+ import sys
18
+ import threading
19
+ import time
20
+ import cv2
21
+ sys.path.append('..')
22
+ from PIL import Image
23
+ import logging
24
+ import math
25
+ import os
26
+ from pathlib import Path
27
+
28
+ import torch
29
+ import transformers
30
+ from accelerate import Accelerator
31
+ from accelerate.logging import get_logger
32
+ from accelerate.utils import DistributedDataParallelKwargs, ProjectConfiguration, set_seed
33
+ from huggingface_hub import create_repo
34
+ from torch.utils.data import DataLoader
35
+ from tqdm.auto import tqdm
36
+ import numpy as np
37
+ from transformers import AutoTokenizer, T5EncoderModel
38
+
39
+ import diffusers
40
+ from diffusers import AutoencoderKLCogVideoX, CogVideoXDPMScheduler
41
+ from diffusers.optimization import get_scheduler
42
+ from diffusers.training_utils import (
43
+ cast_training_params,
44
+ free_memory,
45
+ )
46
+ from diffusers.utils import check_min_version, export_to_video, is_wandb_available
47
+ from diffusers.utils.torch_utils import is_compiled_module
48
+
49
+ from controlnet_datasets import FullMotionBlurDataset, GoPro2xMotionBlurDataset, OutsidePhotosDataset, GoProMotionBlurDataset, BAISTDataset
50
+ from controlnet_pipeline import ControlnetCogVideoXPipeline
51
+ from cogvideo_transformer import CogVideoXTransformer3DModel
52
+ from helpers import random_insert_latent_frame, transform_intervals
53
+ import os
54
+ from utils import save_frames_as_pngs, compute_prompt_embeddings, prepare_rotary_positional_embeddings, encode_prompt, get_optimizer, atomic_save, get_args
55
+ if is_wandb_available():
56
+ import wandb
57
+
58
+ # Will error if the minimal version of diffusers is not installed. Remove at your own risk.
59
+ check_min_version("0.31.0.dev0")
60
+
61
+ logger = get_logger(__name__)
62
+
63
+
64
+ def log_validation(
65
+ pipe,
66
+ args,
67
+ accelerator,
68
+ pipeline_args,
69
+ ):
70
+ logger.info(
71
+ f"Running validation... \n Generating {args.num_validation_videos} videos with prompt: {pipeline_args['prompt']}."
72
+ )
73
+ # We train on the simplified learning objective. If we were previously predicting a variance, we need the scheduler to ignore it
74
+ scheduler_args = {}
75
+
76
+ if "variance_type" in pipe.scheduler.config:
77
+ variance_type = pipe.scheduler.config.variance_type
78
+
79
+ if variance_type in ["learned", "learned_range"]:
80
+ variance_type = "fixed_small"
81
+
82
+ scheduler_args["variance_type"] = variance_type
83
+
84
+ pipe.scheduler = CogVideoXDPMScheduler.from_config(pipe.scheduler.config, **scheduler_args)
85
+ pipe = pipe.to(accelerator.device)
86
+
87
+ # run inference
88
+ generator = torch.Generator(device=accelerator.device).manual_seed(args.seed) if args.seed else None
89
+
90
+ videos = []
91
+ for _ in range(args.num_validation_videos):
92
+ video = pipe(**pipeline_args, generator=generator, output_type="np").frames[0]
93
+ videos.append(video)
94
+
95
+ free_memory() # release cached GPU memory before returning
96
+
97
+ return videos
98
+
99
+
100
+
101
+ def main(args):
102
+ global signal_recieved_time
103
+ if args.report_to == "wandb" and args.hub_token is not None:
104
+ raise ValueError(
105
+ "You cannot use both --report_to=wandb and --hub_token due to a security risk of exposing your token."
106
+ " Please use `huggingface-cli login` to authenticate with the Hub."
107
+ )
108
+
109
+ if torch.backends.mps.is_available() and args.mixed_precision == "bf16":
110
+ # due to pytorch#99272, MPS does not yet support bfloat16.
111
+ raise ValueError(
112
+ "Mixed precision training with bfloat16 is not supported on MPS. Please use fp16 (recommended) or fp32 instead."
113
+ )
114
+
115
+ logging_dir = Path(args.output_dir, args.logging_dir)
116
+
117
+ accelerator_project_config = ProjectConfiguration(project_dir=args.output_dir, logging_dir=logging_dir)
118
+ kwargs = DistributedDataParallelKwargs(find_unused_parameters=True)
119
+ accelerator = Accelerator(
120
+ gradient_accumulation_steps=args.gradient_accumulation_steps,
121
+ mixed_precision=args.mixed_precision,
122
+ log_with=args.report_to,
123
+ project_config=accelerator_project_config,
124
+ kwargs_handlers=[kwargs],
125
+ )
126
+
127
+ # Disable AMP for MPS.
128
+ if torch.backends.mps.is_available():
129
+ accelerator.native_amp = False
130
+
131
+ if args.report_to == "wandb":
132
+ if not is_wandb_available():
133
+ raise ImportError("Make sure to install wandb if you want to use it for logging during training.")
134
+
135
+ # Make one log on every process with the configuration for debugging.
136
+ logging.basicConfig(
137
+ format="%(asctime)s - %(levelname)s - %(name)s - %(message)s",
138
+ datefmt="%m/%d/%Y %H:%M:%S",
139
+ level=logging.INFO,
140
+ )
141
+ logger.info(accelerator.state, main_process_only=False)
142
+ if accelerator.is_local_main_process:
143
+ transformers.utils.logging.set_verbosity_warning()
144
+ diffusers.utils.logging.set_verbosity_info()
145
+ else:
146
+ transformers.utils.logging.set_verbosity_error()
147
+ diffusers.utils.logging.set_verbosity_error()
148
+
149
+ # If passed along, set the training seed now.
150
+ if args.seed is not None:
151
+ set_seed(args.seed)
152
+
153
+ # Handle the repository creation
154
+ if accelerator.is_main_process:
155
+ if args.output_dir is not None:
156
+ os.makedirs(args.output_dir, exist_ok=True)
157
+
158
+ if args.push_to_hub:
159
+ repo_id = create_repo(
160
+ repo_id=args.hub_model_id or Path(args.output_dir).name,
161
+ exist_ok=True,
162
+ ).repo_id
163
+
164
+ # Prepare models and scheduler
165
+ tokenizer = AutoTokenizer.from_pretrained(
166
+ os.path.join(args.base_dir, args.pretrained_model_name_or_path), subfolder="tokenizer", revision=args.revision
167
+ )
168
+
169
+ text_encoder = T5EncoderModel.from_pretrained(
170
+ os.path.join(args.base_dir, args.pretrained_model_name_or_path), subfolder="text_encoder", revision=args.revision
171
+ )
172
+
173
+ # CogVideoX-2b weights are stored in float16
174
+ config = CogVideoXTransformer3DModel.load_config(
175
+ os.path.join(args.base_dir, args.pretrained_model_name_or_path),
176
+ subfolder="transformer",
177
+ revision=args.revision,
178
+ variant=args.variant,
179
+ )
180
+
181
+ load_dtype = torch.bfloat16 if "5b" in os.path.join(args.base_dir, args.pretrained_model_name_or_path).lower() else torch.float16
182
+ transformer = CogVideoXTransformer3DModel.from_pretrained(
183
+ os.path.join(args.base_dir, args.pretrained_model_name_or_path),
184
+ subfolder="transformer",
185
+ torch_dtype=load_dtype,
186
+ revision=args.revision,
187
+ variant=args.variant,
188
+ low_cpu_mem_usage=False,
189
+ )
190
+
191
+ vae = AutoencoderKLCogVideoX.from_pretrained(
192
+ os.path.join(args.base_dir, args.pretrained_model_name_or_path), subfolder="vae", revision=args.revision, variant=args.variant
193
+ )
194
+
195
+ scheduler = CogVideoXDPMScheduler.from_pretrained(os.path.join(args.base_dir, args.pretrained_model_name_or_path), subfolder="scheduler")
196
+
197
+ if args.enable_slicing:
198
+ vae.enable_slicing()
199
+ if args.enable_tiling:
200
+ vae.enable_tiling()
201
+
202
+ # We only train the additional adapter controlnet layers
203
+ text_encoder.requires_grad_(False)
204
+ transformer.requires_grad_(True)
205
+ vae.requires_grad_(False)
206
+
207
+ # For mixed precision training we cast all non-trainable weights (vae, text_encoder and transformer) to half-precision
208
+ # as these weights are only used for inference, keeping weights in full precision is not required.
209
+ weight_dtype = torch.float32
210
+ if accelerator.state.deepspeed_plugin:
211
+ # DeepSpeed is handling precision, use what's in the DeepSpeed config
212
+ if (
213
+ "fp16" in accelerator.state.deepspeed_plugin.deepspeed_config
214
+ and accelerator.state.deepspeed_plugin.deepspeed_config["fp16"]["enabled"]
215
+ ):
216
+ weight_dtype = torch.float16
217
+ if (
218
+ "bf16" in accelerator.state.deepspeed_plugin.deepspeed_config
219
+ and accelerator.state.deepspeed_plugin.deepspeed_config["bf16"]["enabled"]
220
+ ):
221
+ weight_dtype = torch.bfloat16
222
+ else:
223
+ if accelerator.mixed_precision == "fp16":
224
+ weight_dtype = torch.float16
225
+ elif accelerator.mixed_precision == "bf16":
226
+ weight_dtype = torch.bfloat16
227
+
228
+ if torch.backends.mps.is_available() and weight_dtype == torch.bfloat16:
229
+ # due to pytorch#99272, MPS does not yet support bfloat16.
230
+ raise ValueError(
231
+ "Mixed precision training with bfloat16 is not supported on MPS. Please use fp16 (recommended) or fp32 instead."
232
+ )
233
+
234
+ text_encoder.to(accelerator.device, dtype=weight_dtype)
235
+ transformer.to(accelerator.device, dtype=weight_dtype)
236
+ vae.to(accelerator.device, dtype=weight_dtype)
237
+
238
+ if args.gradient_checkpointing:
239
+ transformer.enable_gradient_checkpointing()
240
+
241
+ def unwrap_model(model):
242
+ model = accelerator.unwrap_model(model)
243
+ model = model._orig_mod if is_compiled_module(model) else model
244
+ return model
245
+
246
+ # Enable TF32 for faster training on Ampere GPUs,
247
+ # cf https://pytorch.org/docs/stable/notes/cuda.html#tensorfloat-32-tf32-on-ampere-devices
248
+ if args.allow_tf32 and torch.cuda.is_available():
249
+ torch.backends.cuda.matmul.allow_tf32 = True
250
+
251
+ if args.scale_lr:
252
+ args.learning_rate = (
253
+ args.learning_rate * args.gradient_accumulation_steps * args.train_batch_size * accelerator.num_processes
254
+ )
255
+
256
+ # Make sure the trainable params are in float32.
257
+ if args.mixed_precision == "fp16":
258
+ # only upcast trainable parameters into fp32
259
+ cast_training_params([transformer], dtype=torch.float32)
260
+
261
+ trainable_parameters = list(filter(lambda p: p.requires_grad, transformer.parameters()))
262
+
263
+ # Optimization parameters
264
+ trainable_parameters_with_lr = {"params": trainable_parameters, "lr": args.learning_rate}
265
+ params_to_optimize = [trainable_parameters_with_lr]
266
+
267
+ use_deepspeed_optimizer = (
268
+ accelerator.state.deepspeed_plugin is not None
269
+ and "optimizer" in accelerator.state.deepspeed_plugin.deepspeed_config
270
+ )
271
+ use_deepspeed_scheduler = (
272
+ accelerator.state.deepspeed_plugin is not None
273
+ and "scheduler" not in accelerator.state.deepspeed_plugin.deepspeed_config
274
+ )
275
+
276
+ optimizer = get_optimizer(args, params_to_optimize, use_deepspeed=use_deepspeed_optimizer)
277
+
278
+ # Dataset and DataLoader
279
+ DATASET_REGISTRY = {
280
+ "gopro": GoProMotionBlurDataset,
281
+ "gopro2x": GoPro2xMotionBlurDataset,
282
+ "full": FullMotionBlurDataset,
283
+ "baist": BAISTDataset,
284
+ "outsidephotos": OutsidePhotosDataset, # val-only special (no split)
285
+ }
286
+
287
+ if args.dataset not in DATASET_REGISTRY:
288
+ raise ValueError(f"Unknown dataset: {args.dataset}")
289
+
290
+ train_dataset_class = DATASET_REGISTRY[args.dataset]
291
+ val_dataset_class = train_dataset_class
292
+
293
+ common_kwargs = dict(
294
+ data_dir=os.path.join(args.base_dir, args.video_root_dir),
295
+ output_dir = args.output_dir,
296
+ image_size=(args.height, args.width),
297
+ stride=(args.stride_min, args.stride_max),
298
+ sample_n_frames=args.max_num_frames,
299
+ hflip_p=args.hflip_p,
300
+ )
301
+
302
+ def build_kwargs(is_train: bool):
303
+ """Return constructor kwargs, adding split"""
304
+ kw = dict(common_kwargs)
305
+ kw["split"] = "train" if is_train else args.val_split
306
+ return kw
307
+
308
+ train_dataset = train_dataset_class(**build_kwargs(is_train=True))
309
+ val_dataset = val_dataset_class(**build_kwargs(is_train=False))
310
+
311
+ def encode_video(video):
312
+ video = video.to(accelerator.device, dtype=vae.dtype)
313
+ video = video.permute(0, 2, 1, 3, 4) # [B, C, F, H, W]
314
+ latent_dist = vae.encode(video).latent_dist.sample() * vae.config.scaling_factor
315
+ return latent_dist.permute(0, 2, 1, 3, 4).to(memory_format=torch.contiguous_format)
316
+
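+ # Note (illustrative): encode_video takes [B, F, C, H, W] pixel video, permutes
+ # to the [B, C, F, H, W] layout the CogVideoX VAE expects, samples the scaled
+ # latent distribution, and permutes back to [B, F', C', H', W'], where F' is
+ # the (temporally compressed) latent frame count.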
317
+ def collate_fn(examples):
318
+ blur_img = [example["blur_img"] for example in examples]
319
+ videos = [example["video"] for example in examples]
320
+ if "high_fps_video" in examples[0]:
321
+ high_fps_videos = [example["high_fps_video"] for example in examples]
322
+ high_fps_videos = torch.stack(high_fps_videos)
323
+ high_fps_videos = high_fps_videos.to(memory_format=torch.contiguous_format).float()
324
+ if "bbx" in examples[0]:
325
+ bbx = [example["bbx"] for example in examples]
326
+ bbx = torch.stack(bbx)
327
+ bbx = bbx.to(memory_format=torch.contiguous_format).float()
328
+ prompts = [example["caption"] for example in examples]
329
+ file_names = [example["file_name"] for example in examples]
330
+ num_frames = [example["num_frames"] for example in examples]
331
+ input_intervals = [example["input_interval"] for example in examples]
332
+ output_intervals = [example["output_interval"] for example in examples]
333
+
334
+ videos = torch.stack(videos)
335
+ videos = videos.to(memory_format=torch.contiguous_format).float()
336
+
337
+ blur_img = torch.stack(blur_img)
338
+ blur_img = blur_img.to(memory_format=torch.contiguous_format).float()
339
+
340
+
341
+ input_intervals = torch.stack(input_intervals)
342
+ input_intervals = input_intervals.to(memory_format=torch.contiguous_format).float()
343
+
344
+ output_intervals = torch.stack(output_intervals)
345
+ output_intervals = output_intervals.to(memory_format=torch.contiguous_format).float()
346
+
347
+
348
+ out_dict = {
349
+ "file_names": file_names,
350
+ "blur_img": blur_img,
351
+ "videos": videos,
352
+ "num_frames": num_frames,
353
+ "prompts": prompts,
354
+ "input_intervals": input_intervals,
355
+ "output_intervals": output_intervals,
356
+ }
357
+
358
+ if "high_fps_video" in examples[0]:
359
+ out_dict["high_fps_video"] = high_fps_videos
360
+ if "bbx" in examples[0]:
361
+ out_dict["bbx"] = bbx
362
+ return out_dict
363
+
364
+ train_dataloader = DataLoader(
365
+ train_dataset,
366
+ batch_size=args.train_batch_size,
367
+ shuffle=True,
368
+ collate_fn=collate_fn,
369
+ num_workers=args.dataloader_num_workers,
370
+ )
371
+
372
+ val_dataloader = DataLoader(
373
+ val_dataset,
374
+ batch_size=1,
375
+ shuffle=False,
376
+ collate_fn=collate_fn,
377
+ num_workers=args.dataloader_num_workers,
378
+ )
379
+
380
+ # Scheduler and math around the number of training steps.
381
+ overrode_max_train_steps = False
382
+ num_update_steps_per_epoch = math.ceil(len(train_dataloader) / args.gradient_accumulation_steps)
383
+ if args.max_train_steps is None:
384
+ args.max_train_steps = args.num_train_epochs * num_update_steps_per_epoch
385
+ overrode_max_train_steps = True
386
+
387
+ if use_deepspeed_scheduler:
388
+ from accelerate.utils import DummyScheduler
389
+
390
+ lr_scheduler = DummyScheduler(
391
+ name=args.lr_scheduler,
392
+ optimizer=optimizer,
393
+ total_num_steps=args.max_train_steps * accelerator.num_processes,
394
+ num_warmup_steps=args.lr_warmup_steps * accelerator.num_processes,
395
+ )
396
+ else:
397
+ lr_scheduler = get_scheduler(
398
+ args.lr_scheduler,
399
+ optimizer=optimizer,
400
+ num_warmup_steps=args.lr_warmup_steps * accelerator.num_processes,
401
+ num_training_steps=args.max_train_steps * accelerator.num_processes,
402
+ num_cycles=args.lr_num_cycles,
403
+ power=args.lr_power,
404
+ )
405
+
406
+ # Prepare everything with our `accelerator`.
407
+ transformer, optimizer, train_dataloader, lr_scheduler, val_dataloader = accelerator.prepare(
408
+ transformer, optimizer, train_dataloader, lr_scheduler, val_dataloader
409
+ )
410
+
411
+ # We need to recalculate our total training steps as the size of the training dataloader may have changed.
412
+ num_update_steps_per_epoch = math.ceil(len(train_dataloader) / args.gradient_accumulation_steps)
413
+ if overrode_max_train_steps:
414
+ args.max_train_steps = args.num_train_epochs * num_update_steps_per_epoch
415
+ # Afterwards we recalculate our number of training epochs
416
+ args.num_train_epochs = math.ceil(args.max_train_steps / num_update_steps_per_epoch)
417
+
418
+ # We need to initialize the trackers we use, and also store our configuration.
419
+ # The trackers initialize automatically on the main process.
420
+ if accelerator.is_main_process:
421
+ tracker_name = args.tracker_name or "cogvideox-controlnet"
422
+ accelerator.init_trackers(tracker_name, config=vars(args))
423
+
424
+
425
+ accelerator.register_for_checkpointing(transformer, optimizer, lr_scheduler)
426
+ save_path = os.path.join(args.output_dir, f"checkpoint")
427
+
428
+ #check if the checkpoint already exists
429
+ if os.path.exists(save_path):
430
+ accelerator.load_state(save_path)
431
+ logger.info(f"Loaded state from {save_path}")
432
+
433
+ # Train!
434
+ total_batch_size = args.train_batch_size * accelerator.num_processes * args.gradient_accumulation_steps
435
+ num_trainable_parameters = sum(param.numel() for model in params_to_optimize for param in model["params"])
436
+
437
+ logger.info("***** Running training *****")
438
+ logger.info(f" Num trainable parameters = {num_trainable_parameters}")
439
+ logger.info(f" Num examples = {len(train_dataset)}")
440
+ logger.info(f" Num batches each epoch = {len(train_dataloader)}")
441
+ logger.info(f" Num epochs = {args.num_train_epochs}")
442
+ logger.info(f" Instantaneous batch size per device = {args.train_batch_size}")
443
+ logger.info(f" Total train batch size (w. parallel, distributed & accumulation) = {total_batch_size}")
444
+ logger.info(f" Gradient accumulation steps = {args.gradient_accumulation_steps}")
445
+ logger.info(f" Total optimization steps = {args.max_train_steps}")
446
+ global_step = 0
447
+ first_epoch = 0
448
+ initial_global_step = 0
449
+
450
+ progress_bar = tqdm(
451
+ range(0, args.max_train_steps),
452
+ initial=initial_global_step,
453
+ desc="Steps",
454
+ # Only show the progress bar once on each machine.
455
+ disable=not accelerator.is_local_main_process,
456
+ )
457
+ vae_scale_factor_spatial = 2 ** (len(vae.config.block_out_channels) - 1)
458
+
459
+ # For DeepSpeed training
460
+ model_config = transformer.module.config if hasattr(transformer, "module") else transformer.config
461
+
462
+ for epoch in range(first_epoch, args.num_train_epochs):
463
+ transformer.train()
464
+ for step, batch in enumerate(train_dataloader):
465
+ if not args.just_validate:
466
+ models_to_accumulate = [transformer]
467
+ with accelerator.accumulate(models_to_accumulate):
468
+ model_input = encode_video(batch["videos"]).to(dtype=weight_dtype) # [B, F, C, H, W]
469
+ prompts = batch["prompts"]
470
+ image_latent = encode_video(batch["blur_img"]).to(dtype=weight_dtype) # [B, F, C, H, W]
471
+ input_intervals = batch["input_intervals"]
472
+ output_intervals = batch["output_intervals"]
473
+
474
+ batch_size = len(prompts)
475
+ # True = use real prompt (conditional); False = drop to empty (unconditional)
476
+ guidance_mask = torch.rand(batch_size, device=accelerator.device) >= 0.2
477
+
478
+ # build a new prompts list: keep the original where mask True, else blank
479
+ per_sample_prompts = [
480
+ prompts[i] if guidance_mask[i] else ""
481
+ for i in range(batch_size)
482
+ ]
483
+ prompts = per_sample_prompts
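+ # (Illustrative) classifier-free-guidance style dropout: with probability 0.2 a
+ # sample's prompt is replaced by "" and, further below, its conditioning latent
+ # frames are zeroed out as well.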
484
+
485
+ # encode prompts
486
+ prompt_embeds = compute_prompt_embeddings(
487
+ tokenizer,
488
+ text_encoder,
489
+ prompts,
490
+ model_config.max_text_seq_length,
491
+ accelerator.device,
492
+ weight_dtype,
493
+ requires_grad=False,
494
+ )
495
+
496
+ # Sample noise that will be added to the latents
497
+ noise = torch.randn_like(model_input)
498
+ batch_size, num_frames, num_channels, height, width = model_input.shape
499
+
500
+ # Sample a random timestep for each image
501
+ timesteps = torch.randint(
502
+ 0, scheduler.config.num_train_timesteps, (batch_size,), device=model_input.device
503
+ )
504
+ timesteps = timesteps.long()
505
+
506
+ # Prepare rotary embeds
507
+ image_rotary_emb = (
508
+ prepare_rotary_positional_embeddings(
509
+ height=args.height,
510
+ width=args.width,
511
+ num_frames=num_frames,
512
+ vae_scale_factor_spatial=vae_scale_factor_spatial,
513
+ patch_size=model_config.patch_size,
514
+ attention_head_dim=model_config.attention_head_dim,
515
+ device=accelerator.device,
516
+ )
517
+ if model_config.use_rotary_positional_embeddings
518
+ else None
519
+ )
520
+
521
+ # Add noise to the model input according to the noise magnitude at each timestep (this is the forward diffusion process)
522
+ noisy_model_input = scheduler.add_noise(model_input, noise, timesteps)
523
+
524
+ input_intervals = transform_intervals(input_intervals, frames_per_latent=4)
525
+ output_intervals = transform_intervals(output_intervals, frames_per_latent=4)
526
+
527
+ #first interval is always rep
528
+ noisy_model_input, target, condition_mask, intervals = random_insert_latent_frame(image_latent, noisy_model_input, model_input, input_intervals, output_intervals, special_info=args.special_info)
529
+
530
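+ # for the unconditional samples, also zero the inserted conditioning latents so image conditioning is dropped together with the prompt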
+ for i in range(batch_size):
531
+ if not guidance_mask[i]:
532
+ noisy_model_input[i][condition_mask[i]] = 0
533
+
534
+ # Predict the noise residual
535
+ model_output = transformer(
536
+ hidden_states=noisy_model_input,
537
+ encoder_hidden_states=prompt_embeds,
538
+ intervals=intervals,
539
+ condition_mask=condition_mask,
540
+ timestep=timesteps,
541
+ image_rotary_emb=image_rotary_emb,
542
+ return_dict=False,
543
+ )[0]
544
+
545
+ #note: get_velocity also rescales the inserted conditioning latents inside noisy_model_input, so the model would otherwise learn to rescale that input latent
546
+ #thus, the first (conditioning) frame is replaced with the original frame later
547
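+ # assuming diffusers' standard get_velocity(sample, noise, t) = sqrt(alpha_bar_t) * noise - sqrt(1 - alpha_bar_t) * sample,
+ # the call below computes sqrt(alpha_bar_t) * noisy_model_input - sqrt(1 - alpha_bar_t) * model_output,
+ # i.e. it converts the transformer's v-prediction into a prediction of the clean latents, compared against target below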
+ model_pred = scheduler.get_velocity(model_output, noisy_model_input, timesteps)
548
+
549
+ alphas_cumprod = scheduler.alphas_cumprod[timesteps]
550
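+ # 1 / (1 - alpha_bar_t) equals SNR(t) + 1, so the per-timestep MSE below is SNR+1-weighted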
+ weights = 1 / (1 - alphas_cumprod)
551
+ while len(weights.shape) < len(model_pred.shape):
552
+ weights = weights.unsqueeze(-1)
553
+
554
+
555
+
556
+ loss = torch.mean((weights * (model_pred[~condition_mask] - target[~condition_mask]) ** 2).reshape(batch_size, -1), dim=1)
557
+ loss = loss.mean()
558
+ accelerator.backward(loss)
559
+
560
+ if accelerator.state.deepspeed_plugin is None:
561
+ if not args.just_validate:
562
+ optimizer.step()
563
+ optimizer.zero_grad()
564
+ lr_scheduler.step()
565
+
566
+ #wait for all processes to finish
567
+ accelerator.wait_for_everyone()
568
+
569
+
570
+ # Checks if the accelerator has performed an optimization step behind the scenes
571
+ if accelerator.sync_gradients:
572
+ progress_bar.update(1)
573
+ global_step += 1
574
+
575
+ if signal_recieved_time != 0:
576
+ if time.time() - signal_recieved_time > 60:
577
+ print("Signal received, saving state and exiting")
578
+ atomic_save(save_path, accelerator)
579
+ signal_recieved_time = 0
580
+ exit(0)
581
+ else:
582
+ exit(0)
583
+
584
+ if accelerator.is_main_process:
585
+ if global_step % args.checkpointing_steps == 0:
586
+ atomic_save(save_path, accelerator)
587
+ logger.info(f"Saved state to {save_path}")
588
+
589
+ logs = {"loss": loss.detach().item(), "lr": lr_scheduler.get_last_lr()[0]}
590
+ progress_bar.set_postfix(**logs)
591
+ accelerator.log(logs, step=global_step)
592
+
593
+ if global_step >= args.max_train_steps:
594
+ break
595
+
596
+ print("Step", step)
597
+ accelerator.wait_for_everyone()
598
+
599
+ if step == 0 or (args.validation_prompt is not None and (step + 1) % args.validation_steps == 0):
600
+ # Create pipeline
601
+ pipe = ControlnetCogVideoXPipeline.from_pretrained(
602
+ os.path.join(args.base_dir, args.pretrained_model_name_or_path),
603
+ transformer=unwrap_model(transformer),
604
+ text_encoder=unwrap_model(text_encoder),
605
+ vae=unwrap_model(vae),
606
+ scheduler=scheduler,
607
+ torch_dtype=weight_dtype,
608
+ )
609
+
610
+ print("Length of validation dataset: ", len(val_dataloader))
611
+ #create a pipeline per accelerator device (for faster inference)
612
+ with torch.autocast(str(accelerator.device).replace(":0", ""), enabled=accelerator.mixed_precision == "fp16"):
613
+ for batch in val_dataloader:
614
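+ # map the blurry conditioning image from [-1, 1] float to [0, 255] uint8 for the pipeline's image input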
+ frame = ((batch["blur_img"][0].permute(0,2,3,1).cpu().numpy() + 1)*127.5).astype(np.uint8)
615
+ pipeline_args = {
616
+ "prompt": "",
617
+ "negative_prompt": "",
618
+ "image": frame,
619
+ "input_intervals": batch["input_intervals"][0:1],
620
+ "output_intervals": batch["output_intervals"][0:1],
621
+ "guidance_scale": args.guidance_scale,
622
+ "use_dynamic_cfg": args.use_dynamic_cfg,
623
+ "height": args.height,
624
+ "width": args.width,
625
+ "num_frames": args.max_num_frames,
626
+ "num_inference_steps": args.num_inference_steps,
627
+ }
628
+
629
+ modified_filenames = []
630
+ filenames = batch['file_names']
631
+ for file in filenames:
632
+ modified_filenames.append(os.path.splitext(file)[0] + ".mp4")
633
+
634
+ num_frames = batch["num_frames"][0]
635
+ #save the gt_video output
636
+ if args.dataset not in ["outsidephotos"]:
637
+ gt_video = batch["videos"][0].permute(0,2,3,1).cpu().numpy()
638
+ gt_video = ((gt_video + 1) * 127.5)/255
639
+ gt_video = gt_video[0:num_frames]
640
+
641
+ for file in modified_filenames:
642
+ gt_file_name = os.path.join(args.output_dir, "gt", modified_filenames[0])
643
+ os.makedirs(os.path.dirname(gt_file_name), exist_ok=True)
644
+ if args.dataset == "baist":
645
+ bbox = batch["bbx"][0].cpu().numpy().astype(np.int32)
646
+ gt_video = gt_video[:, bbox[1]:bbox[3], bbox[0]:bbox[2], :]
647
+ gt_video = np.array([cv2.resize(frame, (160, 192)) for frame in gt_video]) #resize to 192x160
648
+
649
+ save_frames_as_pngs((gt_video*255).astype(np.uint8), gt_file_name.replace(".mp4", "").replace("gt", "gt_frames"))
650
+ export_to_video(gt_video, gt_file_name, fps=20)
651
+
652
+
653
+ if "high_fps_video" in batch:
654
+ high_fps_video = batch["high_fps_video"][0].permute(0,2,3,1).cpu().numpy()
655
+ high_fps_video = ((high_fps_video + 1) * 127.5)/255
656
+ gt_file_name = os.path.join(args.output_dir, "gt_highfps", modified_filenames[0])
657
+
658
+
659
+ #save the blurred image
660
+ if args.dataset in ["full", "outsidephotos", "gopro2x", "baist"]:
661
+ for file in modified_filenames:
662
+ blurry_file_name = os.path.join(args.output_dir, "blurry", modified_filenames[0].replace(".mp4", ".png"))
663
+ os.makedirs(os.path.dirname(blurry_file_name), exist_ok=True)
664
+ if args.dataset == "baist":
665
+ bbox = batch["bbx"][0].cpu().numpy().astype(np.int32)
666
+ frame0 = frame[0][bbox[1]:bbox[3], bbox[0]:bbox[2], :]
667
+ frame0 = cv2.resize(frame0, (160, 192)) #resize to 192x160
668
+ Image.fromarray(frame0).save(blurry_file_name)
669
+ else:
670
+ Image.fromarray(frame[0]).save(blurry_file_name)
671
+
672
+ videos = log_validation(
673
+ pipe=pipe,
674
+ args=args,
675
+ accelerator=accelerator,
676
+ pipeline_args=pipeline_args
677
+ )
678
+
679
+ #save the output video frames as pngs (uncompressed results) and mp4 (compressed results easily viewable)
680
+ for i, video in enumerate(videos):
681
+ video = video[0:num_frames]
682
+ filename = os.path.join(args.output_dir, "deblurred", modified_filenames[0])
683
+ os.makedirs(os.path.dirname(filename), exist_ok=True)
684
+ if args.dataset == "baist":
685
+ bbox = batch["bbx"][0].cpu().numpy().astype(np.int32)
686
+ video = video[:, bbox[1]:bbox[3], bbox[0]:bbox[2], :]
687
+ video = np.array([cv2.resize(frame, (160, 192)) for frame in video]) #resize to 192x160
688
+ save_frames_as_pngs((video*255).astype(np.uint8), filename.replace(".mp4", "").replace("deblurred", "deblurred_frames"))
689
+ export_to_video(video, filename, fps=20)
690
+ accelerator.wait_for_everyone()
691
+
692
+ if args.just_validate:
693
+ exit(0)
694
+
695
+ accelerator.wait_for_everyone()
696
+ accelerator.end_training()
697
+
698
+ signal_recieved_time = 0
699
+
700
+ def handle_signal(signum, frame):
701
+ global signal_recieved_time
702
+ signal_recieved_time = time.time()
703
+
704
+ print(f"Signal {signum} received at {time.ctime()}")
705
+
706
+ with open("/datasets/sai/gencam/cogvideox/interrupted.txt", "w") as f:
707
+ f.write(f"Training was interrupted at {time.ctime()}")
708
+
709
+ if __name__ == "__main__":
710
+
711
+ args = get_args()
712
+
713
+ print("Registering signal handler")
714
+ #Register the signal handler (catch SIGUSR1)
715
+ signal.signal(signal.SIGUSR1, handle_signal)
716
+
717
+ main_thread = threading.Thread(target=main, args=(args,))
718
+ main_thread.start()
719
+
720
+ while signal_recieved_time != 0:
721
+ time.sleep(1)
722
+
723
+ #call main with args as a thread
724
+
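The loop above also wires in preemption handling: handle_signal records when SIGUSR1 arrives, and the training loop checks that timestamp on subsequent optimization steps, saving the checkpoint via atomic_save before exiting once enough time has elapsed. As a minimal, hypothetical illustration (this helper is not part of the repository and assumes you know the PID of the training process), the handler can be triggered from another process like this:

    import os
    import signal

    def request_graceful_stop(pid: int) -> None:
        # The SIGUSR1 handler in the training script records the signal time;
        # the training loop then saves state with atomic_save and exits.
        os.kill(pid, signal.SIGUSR1)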
training/train_controlnet_backup.py ADDED
@@ -0,0 +1,1235 @@
1
+ # Copyright 2024 The HuggingFace Team.
2
+ # All rights reserved.
3
+ #
4
+ # Licensed under the Apache License, Version 2.0 (the "License");
5
+ # you may not use this file except in compliance with the License.
6
+ # You may obtain a copy of the License at
7
+ #
8
+ # http://www.apache.org/licenses/LICENSE-2.0
9
+ #
10
+ # Unless required by applicable law or agreed to in writing, software
11
+ # distributed under the License is distributed on an "AS IS" BASIS,
12
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13
+ # See the License for the specific language governing permissions and
14
+ # limitations under the License.
15
+
16
+ import random
17
+ import signal
18
+ import sys
19
+ import threading
20
+ import time
21
+
22
+ import cv2
23
+ import yaml
24
+
25
+ sys.path.append('..')
26
+ import argparse
27
+ from PIL import Image
28
+ import logging
29
+ import math
30
+ import os
31
+ import shutil
32
+ from pathlib import Path
33
+ from typing import List, Optional, Tuple, Union
34
+
35
+ import torch
36
+ import transformers
37
+ from accelerate import Accelerator
38
+ from accelerate.logging import get_logger
39
+ from accelerate.utils import DistributedDataParallelKwargs, ProjectConfiguration, set_seed
40
+ from huggingface_hub import create_repo, upload_folder
41
+ from peft import LoraConfig, get_peft_model_state_dict, set_peft_model_state_dict
42
+ from torch.utils.data import DataLoader, Dataset
43
+ from torchvision import transforms
44
+ from tqdm.auto import tqdm
45
+ import numpy as np
46
+ from decord import VideoReader
47
+ from transformers import AutoTokenizer, T5EncoderModel, T5Tokenizer
48
+
49
+ import diffusers
50
+ from diffusers import AutoencoderKLCogVideoX, CogVideoXDPMScheduler
51
+ from diffusers.models.embeddings import get_3d_rotary_pos_embed
52
+ from diffusers.optimization import get_scheduler
53
+ from diffusers.pipelines.cogvideo.pipeline_cogvideox import get_resize_crop_region_for_grid
54
+ from diffusers.training_utils import (
55
+ cast_training_params,
56
+ free_memory,
57
+ )
58
+ from diffusers.utils import check_min_version, export_to_video, is_wandb_available
59
+ from diffusers.utils.hub_utils import load_or_create_model_card, populate_model_card
60
+ from diffusers.utils.torch_utils import is_compiled_module
61
+
62
+ from controlnet_datasets import AblationFullMotionBlurDataset, AdobeMotionBlurDataset, FullMotionBlurDataset, GoPro2xMotionBlurDataset, GoProLargeMotionBlurDataset, OutsidePhotosDataset, GoProMotionBlurDataset, BAISTDataset, SimpleBAISTDataset
63
+ from controlnet_pipeline import ControlnetCogVideoXPipeline
64
+ from cogvideo_transformer import CogVideoXTransformer3DModel
65
+ from helpers import random_insert_latent_frame, transform_intervals
66
+ import os
67
+ import tempfile
68
+ from atomicwrites import atomic_write
69
+
70
+
71
+
72
+ if is_wandb_available():
73
+ import wandb
74
+
75
+ # Will error if the minimal version of diffusers is not installed. Remove at your own risks.
76
+ check_min_version("0.31.0.dev0")
77
+
78
+ logger = get_logger(__name__)
79
+
80
+ def save_frames_as_pngs(video_array, output_dir,
81
+ downsample_spatial=1, # e.g. 2 to halve width & height
82
+ downsample_temporal=1): # e.g. 2 to keep every 2nd frame
83
+ """
84
+ Save each frame of a (T, H, W, C) numpy array as a PNG with no compression.
85
+ """
86
+ assert video_array.ndim == 4 and video_array.shape[-1] == 3, \
87
+ "Expected (T, H, W, C=3) array"
88
+ assert video_array.dtype == np.uint8, "Expected uint8 array"
89
+
90
+ os.makedirs(output_dir, exist_ok=True)
91
+
92
+ # temporal downsample
93
+ frames = video_array[::downsample_temporal]
94
+
95
+ # compute spatially downsampled size
96
+ T, H, W, _ = frames.shape
97
+ new_size = (W // downsample_spatial, H // downsample_spatial)
98
+
99
+ # PNG compression param: 0 = no compression
100
+ png_params = [cv2.IMWRITE_PNG_COMPRESSION, 0]
101
+
102
+ for idx, frame in enumerate(frames):
103
+ # frame is RGB; convert to BGR for OpenCV
104
+ bgr = frame[..., ::-1]
105
+ if downsample_spatial > 1:
106
+ bgr = cv2.resize(bgr, new_size, interpolation=cv2.INTER_NEAREST)
107
+
108
+ filename = os.path.join(output_dir, "frame_{:05d}.png".format(idx))
109
+ success = cv2.imwrite(filename, bgr, png_params)
110
+ if not success:
111
+ raise RuntimeError("Failed to write frame ")
112
+
113
+
114
+ def get_args():
115
+ parser = argparse.ArgumentParser(description="Training script for CogVideoX using config file.")
116
+ parser.add_argument(
117
+ "--config",
118
+ type=str,
119
+ required=True,
120
+ help="Path to the YAML config file."
121
+ )
122
+ args = parser.parse_args()
123
+
124
+ with open(args.config, "r") as f:
125
+ config = yaml.safe_load(f)
126
+
127
+ args = argparse.Namespace(**config)
128
+
129
+ # Expose the flat YAML config as an argparse.Namespace for easier downstream access
130
+ return args
131
+
132
+
133
+ # def read_video(video_path, start_index=0, frames_count=49, stride=1):
134
+ # video_reader = VideoReader(video_path)
135
+ # end_index = min(start_index + frames_count * stride, len(video_reader)) - 1
136
+ # batch_index = np.linspace(start_index, end_index, frames_count, dtype=int)
137
+ # numpy_video = video_reader.get_batch(batch_index).asnumpy()
138
+ # return numpy_video
139
+
140
+
141
+ def log_validation(
142
+ pipe,
143
+ args,
144
+ accelerator,
145
+ pipeline_args,
146
+ epoch,
147
+ is_final_validation: bool = False,
148
+ ):
149
+ logger.info(
150
+ f"Running validation... \n Generating {args.num_validation_videos} videos with prompt: {pipeline_args['prompt']}."
151
+ )
152
+ # We train on the simplified learning objective. If we were previously predicting a variance, we need the scheduler to ignore it
153
+ scheduler_args = {}
154
+
155
+ if "variance_type" in pipe.scheduler.config:
156
+ variance_type = pipe.scheduler.config.variance_type
157
+
158
+ if variance_type in ["learned", "learned_range"]:
159
+ variance_type = "fixed_small"
160
+
161
+ scheduler_args["variance_type"] = variance_type
162
+
163
+ pipe.scheduler = CogVideoXDPMScheduler.from_config(pipe.scheduler.config, **scheduler_args)
164
+ pipe = pipe.to(accelerator.device)
165
+ # pipe.set_progress_bar_config(disable=True)
166
+
167
+ # run inference
168
+ generator = torch.Generator(device=accelerator.device).manual_seed(args.seed) if args.seed else None
169
+
170
+ videos = []
171
+ for _ in range(args.num_validation_videos):
172
+ video = pipe(**pipeline_args, generator=generator, output_type="np").frames[0]
173
+ videos.append(video)
174
+
175
+ free_memory()
176
+
177
+ return videos
178
+
179
+
180
+ def _get_t5_prompt_embeds(
181
+ tokenizer: T5Tokenizer,
182
+ text_encoder: T5EncoderModel,
183
+ prompt: Union[str, List[str]],
184
+ num_videos_per_prompt: int = 1,
185
+ max_sequence_length: int = 226,
186
+ device: Optional[torch.device] = None,
187
+ dtype: Optional[torch.dtype] = None,
188
+ text_input_ids=None,
189
+ ):
190
+ prompt = [prompt] if isinstance(prompt, str) else prompt
191
+ batch_size = len(prompt)
192
+
193
+ if tokenizer is not None:
194
+ text_inputs = tokenizer(
195
+ prompt,
196
+ padding="max_length",
197
+ max_length=max_sequence_length,
198
+ truncation=True,
199
+ add_special_tokens=True,
200
+ return_tensors="pt",
201
+ )
202
+ text_input_ids = text_inputs.input_ids
203
+ else:
204
+ if text_input_ids is None:
205
+ raise ValueError("`text_input_ids` must be provided when the tokenizer is not specified.")
206
+
207
+ prompt_embeds = text_encoder(text_input_ids.to(device))[0]
208
+ prompt_embeds = prompt_embeds.to(dtype=dtype, device=device)
209
+
210
+ # duplicate text embeddings for each generation per prompt, using mps friendly method
211
+ _, seq_len, _ = prompt_embeds.shape
212
+ prompt_embeds = prompt_embeds.repeat(1, num_videos_per_prompt, 1)
213
+ prompt_embeds = prompt_embeds.view(batch_size * num_videos_per_prompt, seq_len, -1)
214
+
215
+ return prompt_embeds
216
+
217
+
218
+ def encode_prompt(
219
+ tokenizer: T5Tokenizer,
220
+ text_encoder: T5EncoderModel,
221
+ prompt: Union[str, List[str]],
222
+ num_videos_per_prompt: int = 1,
223
+ max_sequence_length: int = 226,
224
+ device: Optional[torch.device] = None,
225
+ dtype: Optional[torch.dtype] = None,
226
+ text_input_ids=None,
227
+ ):
228
+ prompt = [prompt] if isinstance(prompt, str) else prompt
229
+ prompt_embeds = _get_t5_prompt_embeds(
230
+ tokenizer,
231
+ text_encoder,
232
+ prompt=prompt,
233
+ num_videos_per_prompt=num_videos_per_prompt,
234
+ max_sequence_length=max_sequence_length,
235
+ device=device,
236
+ dtype=dtype,
237
+ text_input_ids=text_input_ids,
238
+ )
239
+ return prompt_embeds
240
+
241
+
242
+ def compute_prompt_embeddings(
243
+ tokenizer, text_encoder, prompt, max_sequence_length, device, dtype, requires_grad: bool = False
244
+ ):
245
+ if requires_grad:
246
+ prompt_embeds = encode_prompt(
247
+ tokenizer,
248
+ text_encoder,
249
+ prompt,
250
+ num_videos_per_prompt=1,
251
+ max_sequence_length=max_sequence_length,
252
+ device=device,
253
+ dtype=dtype,
254
+ )
255
+ else:
256
+ with torch.no_grad():
257
+ prompt_embeds = encode_prompt(
258
+ tokenizer,
259
+ text_encoder,
260
+ prompt,
261
+ num_videos_per_prompt=1,
262
+ max_sequence_length=max_sequence_length,
263
+ device=device,
264
+ dtype=dtype,
265
+ )
266
+ return prompt_embeds
267
+
268
+
269
+ def prepare_rotary_positional_embeddings(
270
+ height: int,
271
+ width: int,
272
+ num_frames: int,
273
+ vae_scale_factor_spatial: int = 8,
274
+ patch_size: int = 2,
275
+ attention_head_dim: int = 64,
276
+ device: Optional[torch.device] = None,
277
+ base_height: int = 480,
278
+ base_width: int = 720,
279
+ ) -> Tuple[torch.Tensor, torch.Tensor]:
280
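+ # the latent token grid is the pixel resolution divided by the VAE spatial downsampling factor and the transformer patch size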
+ grid_height = height // (vae_scale_factor_spatial * patch_size)
281
+ grid_width = width // (vae_scale_factor_spatial * patch_size)
282
+ base_size_width = base_width // (vae_scale_factor_spatial * patch_size)
283
+ base_size_height = base_height // (vae_scale_factor_spatial * patch_size)
284
+
285
+ grid_crops_coords = get_resize_crop_region_for_grid((grid_height, grid_width), base_size_width, base_size_height)
286
+ freqs_cos, freqs_sin = get_3d_rotary_pos_embed(
287
+ embed_dim=attention_head_dim,
288
+ crops_coords=grid_crops_coords,
289
+ grid_size=(grid_height, grid_width),
290
+ temporal_size=num_frames,
291
+ )
292
+
293
+ freqs_cos = freqs_cos.to(device=device)
294
+ freqs_sin = freqs_sin.to(device=device)
295
+ return freqs_cos, freqs_sin
296
+
297
+
298
+ def get_optimizer(args, params_to_optimize, use_deepspeed: bool = False):
299
+ # Use the DeepSpeed optimizer
300
+ if use_deepspeed:
301
+ from accelerate.utils import DummyOptim
302
+
303
+
304
+ return DummyOptim(
305
+ params_to_optimize,
306
+ lr=args.learning_rate,
307
+ betas=(args.adam_beta1, args.adam_beta2),
308
+ eps=args.adam_epsilon,
309
+ weight_decay=args.adam_weight_decay,
310
+ )
311
+
312
+ # Optimizer creation
313
+ supported_optimizers = ["adam", "adamw", "prodigy"]
314
+ if args.optimizer not in supported_optimizers:
315
+ logger.warning(
316
+ f"Unsupported choice of optimizer: {args.optimizer}. Supported optimizers include {supported_optimizers}. Defaulting to AdamW"
317
+ )
318
+ args.optimizer = "adamw"
319
+
320
+ if args.use_8bit_adam and args.optimizer.lower() not in ["adam", "adamw"]:
321
+ logger.warning(
322
+ f"use_8bit_adam is ignored when optimizer is not set to 'Adam' or 'AdamW'. Optimizer was "
323
+ f"set to {args.optimizer.lower()}"
324
+ )
325
+
326
+ if args.use_8bit_adam:
327
+ try:
328
+ import bitsandbytes as bnb
329
+ except ImportError:
330
+ raise ImportError(
331
+ "To use 8-bit Adam, please install the bitsandbytes library: `pip install bitsandbytes`."
332
+ )
333
+
334
+ if args.optimizer.lower() == "adamw":
335
+ optimizer_class = bnb.optim.AdamW8bit if args.use_8bit_adam else torch.optim.AdamW
336
+
337
+ optimizer = optimizer_class(
338
+ params_to_optimize,
339
+ betas=(args.adam_beta1, args.adam_beta2),
340
+ eps=args.adam_epsilon,
341
+ weight_decay=args.adam_weight_decay,
342
+ )
343
+ elif args.optimizer.lower() == "adam":
344
+ optimizer_class = bnb.optim.Adam8bit if args.use_8bit_adam else torch.optim.Adam
345
+
346
+
347
+ optimizer = optimizer_class(
348
+ params_to_optimize,
349
+ betas=(args.adam_beta1, args.adam_beta2),
350
+ eps=args.adam_epsilon,
351
+ weight_decay=args.adam_weight_decay,
352
+ )
353
+ elif args.optimizer.lower() == "prodigy":
354
+ try:
355
+ import prodigyopt
356
+ except ImportError:
357
+ raise ImportError("To use Prodigy, please install the prodigyopt library: `pip install prodigyopt`")
358
+
359
+ optimizer_class = prodigyopt.Prodigy
360
+
361
+ if args.learning_rate <= 0.1:
362
+ logger.warning(
363
+ "Learning rate is too low. When using prodigy, it's generally better to set learning rate around 1.0"
364
+ )
365
+
366
+ optimizer = optimizer_class(
367
+ params_to_optimize,
368
+ lr=args.learning_rate,
369
+ betas=(args.adam_beta1, args.adam_beta2),
370
+ beta3=args.prodigy_beta3,
371
+ weight_decay=args.adam_weight_decay,
372
+ eps=args.adam_epsilon,
373
+ decouple=args.prodigy_decouple,
374
+ use_bias_correction=args.prodigy_use_bias_correction,
375
+ safeguard_warmup=args.prodigy_safeguard_warmup,
376
+ )
377
+
378
+ return optimizer
379
+
380
+
381
+ def main(args):
382
+ global signal_recieved_time
383
+ if args.report_to == "wandb" and args.hub_token is not None:
384
+ raise ValueError(
385
+ "You cannot use both --report_to=wandb and --hub_token due to a security risk of exposing your token."
386
+ " Please use `huggingface-cli login` to authenticate with the Hub."
387
+ )
388
+
389
+ if torch.backends.mps.is_available() and args.mixed_precision == "bf16":
390
+ # due to pytorch#99272, MPS does not yet support bfloat16.
391
+ raise ValueError(
392
+ "Mixed precision training with bfloat16 is not supported on MPS. Please use fp16 (recommended) or fp32 instead."
393
+ )
394
+
395
+ logging_dir = Path(args.output_dir, args.logging_dir)
396
+
397
+ accelerator_project_config = ProjectConfiguration(project_dir=args.output_dir, logging_dir=logging_dir)
398
+ kwargs = DistributedDataParallelKwargs(find_unused_parameters=True)
399
+ accelerator = Accelerator(
400
+ gradient_accumulation_steps=args.gradient_accumulation_steps,
401
+ mixed_precision=args.mixed_precision,
402
+ log_with=args.report_to,
403
+ project_config=accelerator_project_config,
404
+ kwargs_handlers=[kwargs],
405
+ )
406
+
407
+ # Disable AMP for MPS.
408
+ if torch.backends.mps.is_available():
409
+ accelerator.native_amp = False
410
+
411
+ if args.report_to == "wandb":
412
+ if not is_wandb_available():
413
+ raise ImportError("Make sure to install wandb if you want to use it for logging during training.")
414
+
415
+ # Make one log on every process with the configuration for debugging.
416
+ logging.basicConfig(
417
+ format="%(asctime)s - %(levelname)s - %(name)s - %(message)s",
418
+ datefmt="%m/%d/%Y %H:%M:%S",
419
+ level=logging.INFO,
420
+ )
421
+ logger.info(accelerator.state, main_process_only=False)
422
+ if accelerator.is_local_main_process:
423
+ transformers.utils.logging.set_verbosity_warning()
424
+ diffusers.utils.logging.set_verbosity_info()
425
+ else:
426
+ transformers.utils.logging.set_verbosity_error()
427
+ diffusers.utils.logging.set_verbosity_error()
428
+
429
+ # If passed along, set the training seed now.
430
+ if args.seed is not None:
431
+ set_seed(args.seed)
432
+
433
+ # Handle the repository creation
434
+ if accelerator.is_main_process:
435
+ if args.output_dir is not None:
436
+ os.makedirs(args.output_dir, exist_ok=True)
437
+
438
+ if args.push_to_hub:
439
+ repo_id = create_repo(
440
+ repo_id=args.hub_model_id or Path(args.output_dir).name,
441
+ exist_ok=True,
442
+ ).repo_id
443
+
444
+ # Prepare models and scheduler
445
+ tokenizer = AutoTokenizer.from_pretrained(
446
+ os.path.join(args.base_dir, args.pretrained_model_name_or_path), subfolder="tokenizer", revision=args.revision
447
+ )
448
+
449
+ text_encoder = T5EncoderModel.from_pretrained(
450
+ os.path.join(args.base_dir, args.pretrained_model_name_or_path), subfolder="text_encoder", revision=args.revision
451
+ )
452
+
453
+ # CogVideoX-2b weights are stored in float16
454
+ # CogVideoX-5b and CogVideoX-5b-I2V weights are stored in bfloat16
455
+
456
+ ## TRYING NEW CONFIG LOADING
457
+ config = CogVideoXTransformer3DModel.load_config(
458
+ os.path.join(args.base_dir, args.pretrained_model_name_or_path),
459
+ subfolder="transformer",
460
+ revision=args.revision,
461
+ variant=args.variant,
462
+ )
463
+ config["ablation_mode"] = args.ablation_mode if hasattr(args, "ablation_mode") else None
464
+
465
+ ##FINISH TRYING NEW CONFIG LOADING
466
+
467
+
468
+
469
+ load_dtype = torch.bfloat16 if "5b" in os.path.join(args.base_dir, args.pretrained_model_name_or_path).lower() else torch.float16
470
+ transformer = CogVideoXTransformer3DModel.from_pretrained(
471
+ os.path.join(args.base_dir, args.pretrained_model_name_or_path),
472
+ subfolder="transformer",
473
+ torch_dtype=load_dtype,
474
+ ablation_mode=args.ablation_mode if hasattr(args, "ablation_mode") else None,
475
+ revision=args.revision,
476
+ variant=args.variant,
477
+ low_cpu_mem_usage=False,
478
+ )
479
+
480
+ vae = AutoencoderKLCogVideoX.from_pretrained(
481
+ os.path.join(args.base_dir, args.pretrained_model_name_or_path), subfolder="vae", revision=args.revision, variant=args.variant
482
+ )
483
+
484
+
485
+
486
+
487
+ scheduler = CogVideoXDPMScheduler.from_pretrained(os.path.join(args.base_dir, args.pretrained_model_name_or_path), subfolder="scheduler")
488
+
489
+ if args.enable_slicing:
490
+ vae.enable_slicing()
491
+ if args.enable_tiling:
492
+ vae.enable_tiling()
493
+
494
+ # Freeze the text encoder and VAE; the transformer (including the added controlnet layers) is trained
495
+ text_encoder.requires_grad_(False)
496
+ transformer.requires_grad_(True)
497
+ vae.requires_grad_(False)
498
+
499
+ # For mixed precision training we cast all non-trainable weights (vae, text_encoder and transformer) to half-precision
500
+ # as these weights are only used for inference, keeping weights in full precision is not required.
501
+ weight_dtype = torch.float32
502
+ if accelerator.state.deepspeed_plugin:
503
+ # DeepSpeed is handling precision, use what's in the DeepSpeed config
504
+ if (
505
+ "fp16" in accelerator.state.deepspeed_plugin.deepspeed_config
506
+ and accelerator.state.deepspeed_plugin.deepspeed_config["fp16"]["enabled"]
507
+ ):
508
+ weight_dtype = torch.float16
509
+ if (
510
+ "bf16" in accelerator.state.deepspeed_plugin.deepspeed_config
511
+ and accelerator.state.deepspeed_plugin.deepspeed_config["bf16"]["enabled"]
512
+ ):
513
+ weight_dtype = torch.bfloat16
514
+ else:
515
+ if accelerator.mixed_precision == "fp16":
516
+ weight_dtype = torch.float16
517
+ elif accelerator.mixed_precision == "bf16":
518
+ weight_dtype = torch.bfloat16
519
+
520
+ if torch.backends.mps.is_available() and weight_dtype == torch.bfloat16:
521
+ # due to pytorch#99272, MPS does not yet support bfloat16.
522
+ raise ValueError(
523
+ "Mixed precision training with bfloat16 is not supported on MPS. Please use fp16 (recommended) or fp32 instead."
524
+ )
525
+
526
+ text_encoder.to(accelerator.device, dtype=weight_dtype)
527
+ transformer.to(accelerator.device, dtype=weight_dtype)
528
+ vae.to(accelerator.device, dtype=weight_dtype)
529
+
530
+ if args.gradient_checkpointing:
531
+ transformer.enable_gradient_checkpointing()
532
+
533
+ def unwrap_model(model):
534
+ model = accelerator.unwrap_model(model)
535
+ model = model._orig_mod if is_compiled_module(model) else model
536
+ return model
537
+
538
+ # Enable TF32 for faster training on Ampere GPUs,
539
+ # cf https://pytorch.org/docs/stable/notes/cuda.html#tensorfloat-32-tf32-on-ampere-devices
540
+ if args.allow_tf32 and torch.cuda.is_available():
541
+ torch.backends.cuda.matmul.allow_tf32 = True
542
+
543
+ if args.scale_lr:
544
+ args.learning_rate = (
545
+ args.learning_rate * args.gradient_accumulation_steps * args.train_batch_size * accelerator.num_processes
546
+ )
547
+
548
+ # Make sure the trainable params are in float32.
549
+ if args.mixed_precision == "fp16":
550
+ # only upcast trainable parameters into fp32
551
+ cast_training_params([transformer], dtype=torch.float32)
552
+
553
+ trainable_parameters = list(filter(lambda p: p.requires_grad, transformer.parameters()))
554
+
555
+ # Optimization parameters
556
+ trainable_parameters_with_lr = {"params": trainable_parameters, "lr": args.learning_rate}
557
+ params_to_optimize = [trainable_parameters_with_lr]
558
+
559
+ use_deepspeed_optimizer = (
560
+ accelerator.state.deepspeed_plugin is not None
561
+ and "optimizer" in accelerator.state.deepspeed_plugin.deepspeed_config
562
+ )
563
+ use_deepspeed_scheduler = (
564
+ accelerator.state.deepspeed_plugin is not None
565
+ and "scheduler" not in accelerator.state.deepspeed_plugin.deepspeed_config
566
+ )
567
+
568
+ optimizer = get_optimizer(args, params_to_optimize, use_deepspeed=use_deepspeed_optimizer)
569
+
570
+ # Dataset and DataLoader
571
+ if args.dataset == "adobe":
572
+ train_dataset = AdobeMotionBlurDataset(
573
+ data_dir=os.path.join(args.base_dir, args.video_root_dir),
574
+ split = "train",
575
+ image_size=(args.height, args.width),
576
+ stride=(args.stride_min, args.stride_max),
577
+ sample_n_frames=args.max_num_frames,
578
+ hflip_p=args.hflip_p,
579
+ )
580
+ elif args.dataset == "gopro":
581
+ train_dataset = GoProMotionBlurDataset(
582
+ data_dir=os.path.join(args.base_dir, args.video_root_dir),
583
+ split = "train",
584
+ image_size=(args.height, args.width),
585
+ stride=(args.stride_min, args.stride_max),
586
+ sample_n_frames=args.max_num_frames,
587
+ hflip_p=args.hflip_p,
588
+ )
589
+ elif args.dataset == "gopro2x":
590
+ train_dataset = GoPro2xMotionBlurDataset(
591
+ data_dir=os.path.join(args.base_dir, args.video_root_dir),
592
+ split = "train",
593
+ image_size=(args.height, args.width),
594
+ stride=(args.stride_min, args.stride_max),
595
+ sample_n_frames=args.max_num_frames,
596
+ hflip_p=args.hflip_p,
597
+ )
598
+ elif args.dataset == "goprolarge":
599
+ train_dataset = GoProLargeMotionBlurDataset(
600
+ data_dir=os.path.join(args.base_dir, args.video_root_dir),
601
+ split = "train",
602
+ image_size=(args.height, args.width),
603
+ stride=(args.stride_min, args.stride_max),
604
+ sample_n_frames=args.max_num_frames,
605
+ hflip_p=args.hflip_p,
606
+ )
607
+ elif args.dataset == "full":
608
+ train_dataset = FullMotionBlurDataset(
609
+ data_dir=os.path.join(args.base_dir, args.video_root_dir),
610
+ split = "train",
611
+ image_size=(args.height, args.width),
612
+ stride=(args.stride_min, args.stride_max),
613
+ sample_n_frames=args.max_num_frames,
614
+ hflip_p=args.hflip_p,
615
+ )
616
+ elif args.dataset == "fullablation":
617
+ train_dataset = AblationFullMotionBlurDataset(
618
+ data_dir=os.path.join(args.base_dir, args.video_root_dir),
619
+ split = "train",
620
+ image_size=(args.height, args.width),
621
+ stride=(args.stride_min, args.stride_max),
622
+ sample_n_frames=args.max_num_frames,
623
+ hflip_p=args.hflip_p,
624
+ ablation_mode = args.ablation_mode, #this is not called for now
625
+ )
626
+ elif args.dataset == "baist":
627
+ train_dataset = BAISTDataset(
628
+ data_dir=os.path.join(args.base_dir, args.video_root_dir),
629
+ split = "train",
630
+ image_size=(args.height, args.width),
631
+ stride=(args.stride_min, args.stride_max),
632
+ sample_n_frames=args.max_num_frames,
633
+ hflip_p=args.hflip_p,
634
+ ) #this is not called for now
635
+ elif args.dataset == "simplebaist":
636
+ train_dataset = SimpleBAISTDataset(
637
+ data_dir=os.path.join(args.base_dir, args.video_root_dir),
638
+ split = "train",
639
+ image_size=(args.height, args.width),
640
+ stride=(args.stride_min, args.stride_max),
641
+ sample_n_frames=args.max_num_frames,
642
+ hflip_p=args.hflip_p,
643
+ )
644
+
645
+
646
+ if args.dataset == "adobe":
647
+ val_dataset = AdobeMotionBlurDataset(
648
+ data_dir=os.path.join(args.base_dir, args.video_root_dir),
649
+ split = args.val_split,
650
+ image_size=(args.height, args.width),
651
+ stride=(args.stride_min, args.stride_max),
652
+ sample_n_frames=args.max_num_frames,
653
+ hflip_p=args.hflip_p,
654
+ )
655
+ elif args.dataset == "outsidephotos":
656
+
657
+ val_dataset = OutsidePhotosDataset(
658
+ data_dir=os.path.join(args.base_dir, args.video_root_dir),
659
+ image_size=(args.height, args.width),
660
+ stride=(args.stride_min, args.stride_max),
661
+ sample_n_frames=args.max_num_frames,
662
+ hflip_p=args.hflip_p,
663
+ )
664
+ train_dataset = val_dataset #dummy dataset
665
+ elif args.dataset == "gopro":
666
+ val_dataset = GoProMotionBlurDataset(
667
+ data_dir=os.path.join(args.base_dir, args.video_root_dir),
668
+ split = args.val_split,
669
+ image_size=(args.height, args.width),
670
+ stride=(args.stride_min, args.stride_max),
671
+ sample_n_frames=args.max_num_frames,
672
+ hflip_p=args.hflip_p,
673
+ )
674
+ elif args.dataset == "gopro2x":
675
+ val_dataset = GoPro2xMotionBlurDataset(
676
+ data_dir=os.path.join(args.base_dir, args.video_root_dir),
677
+ split = args.val_split,
678
+ image_size=(args.height, args.width),
679
+ stride=(args.stride_min, args.stride_max),
680
+ sample_n_frames=args.max_num_frames,
681
+ hflip_p=args.hflip_p,
682
+ )
683
+ elif args.dataset == "goprolarge":
684
+ val_dataset = GoProLargeMotionBlurDataset(
685
+ data_dir=os.path.join(args.base_dir, args.video_root_dir),
686
+ split = args.val_split,
687
+ image_size=(args.height, args.width),
688
+ stride=(args.stride_min, args.stride_max),
689
+ sample_n_frames=args.max_num_frames,
690
+ hflip_p=args.hflip_p,
691
+ )
692
+ elif args.dataset == "full":
693
+ val_dataset = FullMotionBlurDataset(
694
+ data_dir=os.path.join(args.base_dir, args.video_root_dir),
695
+ split = args.val_split,
696
+ image_size=(args.height, args.width),
697
+ stride=(args.stride_min, args.stride_max),
698
+ sample_n_frames=args.max_num_frames,
699
+ hflip_p=args.hflip_p,
700
+ )
701
+ elif args.dataset == "fullablation":
702
+ val_dataset = AblationFullMotionBlurDataset(
703
+ data_dir=os.path.join(args.base_dir, args.video_root_dir),
704
+ split = args.val_split,
705
+ image_size=(args.height, args.width),
706
+ stride=(args.stride_min, args.stride_max),
707
+ sample_n_frames=args.max_num_frames,
708
+ hflip_p=args.hflip_p,
709
+ ablation_mode = args.ablation_mode, #this is not called for now
710
+ )
711
+ elif args.dataset == "baist":
712
+ val_dataset = BAISTDataset(
713
+ data_dir=os.path.join(args.base_dir, args.video_root_dir),
714
+ split = args.val_split,
715
+ image_size=(args.height, args.width),
716
+ stride=(args.stride_min, args.stride_max),
717
+ sample_n_frames=args.max_num_frames,
718
+ hflip_p=args.hflip_p,
719
+ )
720
+ elif args.dataset == "simplebaist":
721
+ val_dataset = SimpleBAISTDataset(
722
+ data_dir=os.path.join(args.base_dir, args.video_root_dir),
723
+ split = args.val_split,
724
+ image_size=(args.height, args.width),
725
+ stride=(args.stride_min, args.stride_max),
726
+ sample_n_frames=args.max_num_frames,
727
+ hflip_p=args.hflip_p,
728
+ )
729
+
730
+ def encode_video(video):
731
+ video = video.to(accelerator.device, dtype=vae.dtype)
732
+ video = video.permute(0, 2, 1, 3, 4) # [B, C, F, H, W]
733
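+ # sample from the VAE posterior and apply the VAE scaling factor expected by the diffusion model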
+ latent_dist = vae.encode(video).latent_dist.sample() * vae.config.scaling_factor
734
+ return latent_dist.permute(0, 2, 1, 3, 4).to(memory_format=torch.contiguous_format)
735
+
736
+ # def atomic_save(save_path, accelerator):
737
+
738
+ # dir_name = os.path.dirname(save_path)
739
+ # with tempfile.NamedTemporaryFile(delete=False, dir=dir_name) as tmp_file:
740
+ # tmp_path = tmp_file.name
741
+ # # Close the file so that it can be moved later
742
+ # #delete anything at the tmp_path
743
+ # if accelerator.is_main_process:
744
+ # accelerator.save_state(tmp_path) #just a backup incase things go crazy
745
+ # accelerator.save_state(save_path)
746
+ # os.remove(tmp_path)
747
+ # accelerator.wait_for_everyone()
748
+
749
+
750
+
751
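+ # note for the function below: os.rename is only atomic when source and destination live on the same
+ # filesystem, which is why the temporary directory is created inside the checkpoint's parent directory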
+ def atomic_save(save_path, accelerator):
752
+ parent = os.path.dirname(save_path)
753
+ tmp_dir = tempfile.mkdtemp(dir=parent)
754
+ backup_dir = save_path + "_backup"
755
+
756
+ try:
757
+ # Save state into the temp directory
758
+ accelerator.save_state(tmp_dir)
759
+
760
+ # Backup existing save_path if it exists
761
+ if os.path.exists(save_path):
762
+ os.rename(save_path, backup_dir)
763
+
764
+ # Atomically move temp directory into place
765
+ os.rename(tmp_dir, save_path)
766
+
767
+ # Clean up the backup directory
768
+ if os.path.exists(backup_dir):
769
+ shutil.rmtree(backup_dir)
770
+
771
+ except Exception as e:
772
+ # Clean up temp directory on failure
773
+ if os.path.exists(tmp_dir):
774
+ shutil.rmtree(tmp_dir)
775
+
776
+ # Restore from backup if replacement failed
777
+ if os.path.exists(backup_dir):
778
+ if os.path.exists(save_path):
779
+ shutil.rmtree(save_path)
780
+ os.rename(backup_dir, save_path)
781
+
782
+ raise e
783
+
784
+
785
+
786
+ def collate_fn(examples):
787
+ blur_img = [example["blur_img"] for example in examples]
788
+ videos = [example["video"] for example in examples]
789
+ if "high_fps_video" in examples[0]:
790
+ high_fps_videos = [example["high_fps_video"] for example in examples]
791
+ high_fps_videos = torch.stack(high_fps_videos)
792
+ high_fps_videos = high_fps_videos.to(memory_format=torch.contiguous_format).float()
793
+ if "bbx" in examples[0]:
794
+ bbx = [example["bbx"] for example in examples]
795
+ bbx = torch.stack(bbx)
796
+ bbx = bbx.to(memory_format=torch.contiguous_format).float()
797
+ prompts = [example["caption"] for example in examples]
798
+ file_names = [example["file_name"] for example in examples]
799
+ num_frames = [example["num_frames"] for example in examples]
800
+ # if full_file_names in examples[0]:
801
+ # full_file_names = [example["full_file_name"] for example in examples]
802
+ input_intervals = [example["input_interval"] for example in examples]
803
+ output_intervals = [example["output_interval"] for example in examples]
804
+ ablation_condition = [example["ablation_condition"] for example in examples] if "ablation_condition" in examples[0] else None
805
+
806
+
807
+ videos = torch.stack(videos)
808
+ videos = videos.to(memory_format=torch.contiguous_format).float()
809
+
810
+ blur_img = torch.stack(blur_img)
811
+ blur_img = blur_img.to(memory_format=torch.contiguous_format).float()
812
+
813
+
814
+ input_intervals = torch.stack(input_intervals)
815
+ if args.dataset == "gopro":
816
+ input_intervals = input_intervals.to(memory_format=torch.contiguous_format).long() #this is a bug, but I trained it like this on GoPro (it sets all intervals to 0); the model doesn't need intervals for this dataset because it's always 7 frames with the same spacing
817
+ else:
818
+ input_intervals = input_intervals.to(memory_format=torch.contiguous_format).float()
819
+
820
+ output_intervals = torch.stack(output_intervals)
821
+ if args.dataset == "gopro":
822
+ output_intervals = output_intervals.to(memory_format=torch.contiguous_format).long() #this is a bug, but I trained it like this on GoPro (it sets all intervals to 0); the model doesn't need intervals for this dataset because it's always 7 frames with the same spacing
823
+ else:
824
+ output_intervals = output_intervals.to(memory_format=torch.contiguous_format).float()
825
+
826
+ #just used for ablation studies
827
+ ablation_condition = torch.stack(ablation_condition) if ablation_condition is not None else None
828
+ if ablation_condition is not None:
829
+ ablation_condition = ablation_condition.to(memory_format=torch.contiguous_format).float()
830
+
831
+ out_dict = {
832
+ "file_names": file_names,
833
+ "blur_img": blur_img,
834
+ "videos": videos,
835
+ "num_frames": num_frames,
836
+ "prompts": prompts,
837
+ "input_intervals": input_intervals,
838
+ "output_intervals": output_intervals,
839
+ }
840
+
841
+ if "high_fps_video" in examples[0]:
842
+ out_dict["high_fps_video"] = high_fps_videos
843
+ if "bbx" in examples[0]:
844
+ out_dict["bbx"] = bbx
845
+ if ablation_condition is not None:
846
+ out_dict["ablation_condition"] = ablation_condition
847
+ return out_dict
848
+
849
+ train_dataloader = DataLoader(
850
+ train_dataset,
851
+ batch_size=args.train_batch_size,
852
+ shuffle=True,
853
+ collate_fn=collate_fn,
854
+ num_workers=args.dataloader_num_workers,
855
+ )
856
+
857
+ val_dataloader = DataLoader(
858
+ val_dataset,
859
+ batch_size=1,
860
+ shuffle=False,
861
+ collate_fn=collate_fn,
862
+ num_workers=args.dataloader_num_workers,
863
+ )
864
+
865
+ # Scheduler and math around the number of training steps.
866
+ overrode_max_train_steps = False
867
+ num_update_steps_per_epoch = math.ceil(len(train_dataloader) / args.gradient_accumulation_steps)
868
+ if args.max_train_steps is None:
869
+ args.max_train_steps = args.num_train_epochs * num_update_steps_per_epoch
870
+ overrode_max_train_steps = True
871
+
872
+ if use_deepspeed_scheduler:
873
+ from accelerate.utils import DummyScheduler
874
+
875
+ lr_scheduler = DummyScheduler(
876
+ name=args.lr_scheduler,
877
+ optimizer=optimizer,
878
+ total_num_steps=args.max_train_steps * accelerator.num_processes,
879
+ num_warmup_steps=args.lr_warmup_steps * accelerator.num_processes,
880
+ )
881
+ else:
882
+ lr_scheduler = get_scheduler(
883
+ args.lr_scheduler,
884
+ optimizer=optimizer,
885
+ num_warmup_steps=args.lr_warmup_steps * accelerator.num_processes,
886
+ num_training_steps=args.max_train_steps * accelerator.num_processes,
887
+ num_cycles=args.lr_num_cycles,
888
+ power=args.lr_power,
889
+ )
890
+
891
+ # Prepare everything with our `accelerator`.
892
+ transformer, optimizer, train_dataloader, lr_scheduler, val_dataloader = accelerator.prepare(
893
+ transformer, optimizer, train_dataloader, lr_scheduler, val_dataloader
894
+ )
895
+
896
+ # We need to recalculate our total training steps as the size of the training dataloader may have changed.
897
+ num_update_steps_per_epoch = math.ceil(len(train_dataloader) / args.gradient_accumulation_steps)
898
+ if overrode_max_train_steps:
899
+ args.max_train_steps = args.num_train_epochs * num_update_steps_per_epoch
900
+ # Afterwards we recalculate our number of training epochs
901
+ args.num_train_epochs = math.ceil(args.max_train_steps / num_update_steps_per_epoch)
902
+
903
+ # We need to initialize the trackers we use, and also store our configuration.
904
+ # The trackers initializes automatically on the main process.
905
+ if accelerator.is_main_process:
906
+ tracker_name = args.tracker_name or "cogvideox-controlnet"
907
+ accelerator.init_trackers(tracker_name, config=vars(args))
908
+
909
+
910
+ accelerator.register_for_checkpointing(transformer, optimizer, lr_scheduler)
911
+ save_path = os.path.join(args.output_dir, f"checkpoint")
912
+
913
+ #check if the checkpoint already exists
914
+ if os.path.exists(save_path):
915
+ accelerator.load_state(save_path)
916
+ logger.info(f"Loaded state from {save_path}")
917
+
918
+
919
+
920
+ # Train!
921
+ total_batch_size = args.train_batch_size * accelerator.num_processes * args.gradient_accumulation_steps
922
+ num_trainable_parameters = sum(param.numel() for model in params_to_optimize for param in model["params"])
923
+
924
+ logger.info("***** Running training *****")
925
+ logger.info(f" Num trainable parameters = {num_trainable_parameters}")
926
+ logger.info(f" Num examples = {len(train_dataset)}")
927
+ logger.info(f" Num batches each epoch = {len(train_dataloader)}")
928
+ logger.info(f" Num epochs = {args.num_train_epochs}")
929
+ logger.info(f" Instantaneous batch size per device = {args.train_batch_size}")
930
+ logger.info(f" Total train batch size (w. parallel, distributed & accumulation) = {total_batch_size}")
931
+ logger.info(f" Gradient accumulation steps = {args.gradient_accumulation_steps}")
932
+ logger.info(f" Total optimization steps = {args.max_train_steps}")
933
+ global_step = 0
934
+ first_epoch = 0
935
+ initial_global_step = 0
936
+
937
+ progress_bar = tqdm(
938
+ range(0, args.max_train_steps),
939
+ initial=initial_global_step,
940
+ desc="Steps",
941
+ # Only show the progress bar once on each machine.
942
+ disable=not accelerator.is_local_main_process,
943
+ )
944
+ vae_scale_factor_spatial = 2 ** (len(vae.config.block_out_channels) - 1)
945
+
946
+ # For DeepSpeed training
947
+ model_config = transformer.module.config if hasattr(transformer, "module") else transformer.config
948
+
949
+ for epoch in range(first_epoch, args.num_train_epochs):
950
+ transformer.train()
951
+ for step, batch in enumerate(train_dataloader):
952
+ if not args.just_validate:
953
+ models_to_accumulate = [transformer]
954
+ with accelerator.accumulate(models_to_accumulate):
955
+ model_input = encode_video(batch["videos"]).to(dtype=weight_dtype) # [B, F, C, H, W]
956
+ prompts = batch["prompts"]
957
+ image_latent = encode_video(batch["blur_img"]).to(dtype=weight_dtype) # [B, F, C, H, W]
958
+ input_intervals = batch["input_intervals"]
959
+ output_intervals = batch["output_intervals"]
960
+ ablation_condition = batch["ablation_condition"] if "ablation_condition" in batch else None
961
+
962
+ batch_size = len(prompts)
963
+ # True = use real prompt (conditional); False = drop to empty (unconditional)
964
+ guidance_mask = torch.rand(batch_size, device=accelerator.device) >= 0.2
965
+
966
+ # build a new prompts list: keep the original where mask True, else blank
967
+ per_sample_prompts = [
968
+ prompts[i] if guidance_mask[i] else ""
969
+ for i in range(batch_size)
970
+ ]
971
+ prompts = per_sample_prompts
972
+
973
+ # encode prompts
974
+ prompt_embeds = compute_prompt_embeddings(
975
+ tokenizer,
976
+ text_encoder,
977
+ prompts,
978
+ model_config.max_text_seq_length,
979
+ accelerator.device,
980
+ weight_dtype,
981
+ requires_grad=False,
982
+ )
983
+
984
+ # Sample noise that will be added to the latents
985
+ noise = torch.randn_like(model_input)
986
+ batch_size, num_frames, num_channels, height, width = model_input.shape
987
+
988
+ # Sample a random timestep for each image
989
+ timesteps = torch.randint(
990
+ 0, scheduler.config.num_train_timesteps, (batch_size,), device=model_input.device
991
+ )
992
+ timesteps = timesteps.long()
993
+
994
+ # Prepare rotary embeds
995
+ image_rotary_emb = (
996
+ prepare_rotary_positional_embeddings(
997
+ height=args.height,
998
+ width=args.width,
999
+ num_frames=num_frames,
1000
+ vae_scale_factor_spatial=vae_scale_factor_spatial,
1001
+ patch_size=model_config.patch_size,
1002
+ attention_head_dim=model_config.attention_head_dim,
1003
+ device=accelerator.device,
1004
+ )
1005
+ if model_config.use_rotary_positional_embeddings
1006
+ else None
1007
+ )
1008
+
1009
+ # Add noise to the model input according to the noise magnitude at each timestep (this is the forward diffusion process)
1010
+ noisy_model_input = scheduler.add_noise(model_input, noise, timesteps)
1011
+
1012
+ input_intervals = transform_intervals(input_intervals, frames_per_latent=4)
1013
+ output_intervals = transform_intervals(output_intervals, frames_per_latent=4)
1014
+
1015
+ #first interval is always rep
1016
+ noisy_model_input, target, condition_mask, intervals = random_insert_latent_frame(image_latent, noisy_model_input, model_input, input_intervals, output_intervals, special_info=args.special_info)
1017
+
1018
+ for i in range(batch_size):
1019
+ if not guidance_mask[i]:
1020
+ noisy_model_input[i][condition_mask[i]] = 0
1021
+
1022
+ # Predict the noise residual
1023
+ model_output = transformer(
1024
+ hidden_states=noisy_model_input,
1025
+ encoder_hidden_states=prompt_embeds,
1026
+ intervals=intervals,
1027
+ condition_mask=condition_mask,
1028
+ timestep=timesteps,
1029
+ image_rotary_emb=image_rotary_emb,
1030
+ return_dict=False,
1031
+ ablation_condition = ablation_condition
1032
+ )[0]
1033
+
1034
+ #note: get_velocity also rescales the inserted conditioning latents inside noisy_model_input, so the model would otherwise learn to rescale that input latent
1035
+ #thus, the first (conditioning) frame is replaced with the original frame later
1036
+ model_pred = scheduler.get_velocity(model_output, noisy_model_input, timesteps)
1037
+
1038
+
1039
+
1040
+ alphas_cumprod = scheduler.alphas_cumprod[timesteps]
1041
+ weights = 1 / (1 - alphas_cumprod)
1042
+ while len(weights.shape) < len(model_pred.shape):
1043
+ weights = weights.unsqueeze(-1)
1044
+
1045
+
1046
+
1047
+ loss = torch.mean((weights * (model_pred[~condition_mask] - target[~condition_mask]) ** 2).reshape(batch_size, -1), dim=1)
1048
+ loss = loss.mean()
1049
+ accelerator.backward(loss)
1050
+
1051
+ if accelerator.state.deepspeed_plugin is None:
1052
+ if not args.just_validate:
1053
+ optimizer.step()
1054
+ optimizer.zero_grad()
1055
+
1056
+ lr_scheduler.step()
1057
+
1058
+
1059
+ #wait for all processes to finish
1060
+ accelerator.wait_for_everyone()
1061
+
1062
+
1063
+ # Checks if the accelerator has performed an optimization step behind the scenes
1064
+ if accelerator.sync_gradients:
1065
+ progress_bar.update(1)
1066
+ global_step += 1
1067
+
1068
+ if signal_recieved_time != 0:
1069
+ if time.time() - signal_recieved_time > 60:
1070
+ print("Signal received, saving state and exiting")
1071
+ #accelerator.save_state(save_path)
1072
+ atomic_save(save_path, accelerator)
1073
+ signal_recieved_time = 0
1074
+ exit(0)
1075
+ else:
1076
+ exit(0)
1077
+
1078
+ if accelerator.is_main_process:
1079
+ if global_step % args.checkpointing_steps == 0:
1080
+ #accelerator.save_state(save_path)
1081
+ atomic_save(save_path, accelerator)
1082
+ logger.info(f"Saved state to {save_path}")
1083
+
1084
+ logs = {"loss": loss.detach().item(), "lr": lr_scheduler.get_last_lr()[0]}
1085
+ progress_bar.set_postfix(**logs)
1086
+ accelerator.log(logs, step=global_step)
1087
+
1088
+ if global_step >= args.max_train_steps:
1089
+ break
1090
+
1091
+ print("Step", step)
1092
+ accelerator.wait_for_everyone()
1093
+
1094
+ if step == 0 or (args.validation_prompt is not None and (step + 1) % args.validation_steps == 0):
1095
+ # Create pipeline
1096
+ pipe = ControlnetCogVideoXPipeline.from_pretrained(
1097
+ os.path.join(args.base_dir, args.pretrained_model_name_or_path),
1098
+ transformer=unwrap_model(transformer),
1099
+ text_encoder=unwrap_model(text_encoder),
1100
+ vae=unwrap_model(vae),
+ scheduler=scheduler,
+ torch_dtype=weight_dtype,
+ )
+
+ print("Length of validation dataset: ", len(val_dataloader))
+ #create a pipeline per accelerator device (for faster inference)
+ with torch.autocast(str(accelerator.device).replace(":0", ""), enabled=accelerator.mixed_precision == "fp16"):
+ for batch in val_dataloader:
+ frame = ((batch["blur_img"][0].permute(0,2,3,1).cpu().numpy() + 1)*127.5).astype(np.uint8)
+ pipeline_args = {
+ "prompt": "",
+ "negative_prompt": "",
+ "image": frame,
+ "input_intervals": batch["input_intervals"][0:1],
+ "output_intervals": batch["output_intervals"][0:1],
+ "ablation_condition": batch["ablation_condition"][0:1] if "ablation_condition" in batch else None,
+ "guidance_scale": args.guidance_scale,
+ "use_dynamic_cfg": args.use_dynamic_cfg,
+ "height": args.height,
+ "width": args.width,
+ "num_frames": args.max_num_frames,
+ "num_inference_steps": args.num_inference_steps,
+ }
+
+ modified_filenames = []
+ filenames = batch['file_names']
+ for file in filenames:
+ modified_filenames.append(os.path.splitext(file)[0] + ".mp4")
+
+ num_frames = batch["num_frames"][0]
+ #save the gt_video output
+ if args.dataset not in ["outsidephotos"]:
+ gt_video = batch["videos"][0].permute(0,2,3,1).cpu().numpy()
+ gt_video = ((gt_video + 1) * 127.5)/255
+ gt_video = gt_video[0:num_frames]
+
+ for file in modified_filenames:
+ #create the directory if it does not exist
+ gt_file_name = os.path.join(args.output_dir, "gt", modified_filenames[0])
+ os.makedirs(os.path.dirname(gt_file_name), exist_ok=True)
+ if args.dataset in ["baist", "simplebaist"]:
+ bbox = batch["bbx"][0].cpu().numpy().astype(np.int32)
+ gt_video = gt_video[:, bbox[1]:bbox[3], bbox[0]:bbox[2], :]
+ gt_video = np.array([cv2.resize(frame, (160, 192)) for frame in gt_video])
+
+ save_frames_as_pngs((gt_video*255).astype(np.uint8), gt_file_name.replace(".mp4", "").replace("gt", "gt_frames"))
+ export_to_video(gt_video, gt_file_name, fps=20)
+
+
+ if "high_fps_video" in batch:
+ high_fps_video = batch["high_fps_video"][0].permute(0,2,3,1).cpu().numpy()
+ high_fps_video = ((high_fps_video + 1) * 127.5)/255
+ gt_file_name = os.path.join(args.output_dir, "gt_highfps", modified_filenames[0])
+
+
+ if args.dataset in ["adobe", "full", "baist", "outsidephotos", "gopro2x", "goprolarge", "simplebaist"]:
+ for file in modified_filenames:
+ #create the directory if it does not exist
+ blurry_file_name = os.path.join(args.output_dir, "blurry", modified_filenames[0].replace(".mp4", ".png"))
+ #save the blurry image
+ os.makedirs(os.path.dirname(blurry_file_name), exist_ok=True)
+ if args.dataset in ["baist", "simplebaist"]:
+ bbox = batch["bbx"][0].cpu().numpy().astype(np.int32)
+ frame0 = frame[0][bbox[1]:bbox[3], bbox[0]:bbox[2], :]
+ #resize to 192x160
+ frame0 = cv2.resize(frame0, (160, 192))
+ Image.fromarray(frame0).save(blurry_file_name)
+ else:
+ Image.fromarray(frame[0]).save(blurry_file_name)
+
+ videos = log_validation(
+ pipe=pipe,
+ args=args,
+ accelerator=accelerator,
+ pipeline_args=pipeline_args,
+ epoch=epoch,
+ )
+
+ for i, video in enumerate(videos):
+ prompt = (
+ pipeline_args["prompt"][:25]
+ .replace(" ", "_")
+ .replace(" ", "_")
+ .replace("'", "_")
+ .replace('"', "_")
+ .replace("/", "_")
+ )
+ video = video[0:num_frames]
+ filename = os.path.join(args.output_dir, "deblurred", modified_filenames[0])
+ print("Deblurred file name", filename)
+ os.makedirs(os.path.dirname(filename), exist_ok=True)
+ if args.dataset in ["baist", "simplebaist"]:
+ bbox = batch["bbx"][0].cpu().numpy().astype(np.int32)
+ video = video[:, bbox[1]:bbox[3], bbox[0]:bbox[2], :]
+ #resize to 192x160
+ video = np.array([cv2.resize(frame, (160, 192)) for frame in video])
+ save_frames_as_pngs((video*255).astype(np.uint8), filename.replace(".mp4", "").replace("deblurred", "deblurred_frames"))
+ export_to_video(video, filename, fps=20)
+
+ accelerator.wait_for_everyone()
+
+ if args.just_validate:
+ exit(0)
+
+ accelerator.wait_for_everyone()
+ accelerator.end_training()
+
+ signal_recieved_time = 0
+
+ def handle_signal(signum, frame):
+ global signal_recieved_time
+ signal_recieved_time = time.time()
+
+ print(f"Signal {signum} received at {time.ctime()}")
+
+ with open("/datasets/sai/gencam/cogvideox/interrupted.txt", "w") as f:
+ f.write(f"Training was interrupted at {time.ctime()}")
+
+ if __name__ == "__main__":
+
+ args = get_args()
+
+ print("Registering signal handler")
+ # Register the signal handler (catch SIGUSR1)
+ signal.signal(signal.SIGUSR1, handle_signal)
+
+ # Call main with args as a worker thread
+ main_thread = threading.Thread(target=main, args=(args,))
+ main_thread.start()
+
+ print("SIGNAL RECEIVED TIME", signal_recieved_time)
+ while signal_recieved_time != 0:
+ time.sleep(1)
+
training/utils.py ADDED
@@ -0,0 +1,299 @@
+ import os
+ from typing import List, Optional, Union, Tuple
+ import torch
+ from transformers import T5EncoderModel, T5Tokenizer
+ import numpy as np
+ import cv2
+ from diffusers.models.embeddings import get_3d_rotary_pos_embed
+ from diffusers.pipelines.cogvideo.pipeline_cogvideox import get_resize_crop_region_for_grid
+ from accelerate.logging import get_logger
+ import tempfile
+ import argparse
+ import yaml
+ import shutil
+
+ logger = get_logger(__name__)
+
+ def get_args():
+ parser = argparse.ArgumentParser(description="Training script for CogVideoX using config file.")
+ parser.add_argument(
+ "--config",
+ type=str,
+ required=True,
+ help="Path to the YAML config file."
+ )
+ args = parser.parse_args()
+ with open(args.config, "r") as f:
+ config = yaml.safe_load(f)
+ # Expand the config dict into an argparse.Namespace for easier downstream attribute access
+ args = argparse.Namespace(**config)
+ return args
+
+
+
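
get_args() expands the YAML file's top-level keys directly into attributes of an argparse.Namespace, so the rest of the code can read args.optimizer, args.learning_rate, and so on. A minimal sketch of that round trip follows; apart from the optimizer fields used later in this file, the key names and values are placeholders, not the commit's actual config schema.

# Sketch (assumed keys/values): build a tiny YAML config and load it via get_args().
import sys
import yaml

example_config = {
    "optimizer": "adamw",        # referenced by get_optimizer below
    "learning_rate": 1e-4,
    "use_8bit_adam": False,
    "adam_beta1": 0.9,
    "adam_beta2": 0.95,
    "adam_epsilon": 1e-8,
    "adam_weight_decay": 1e-4,
}
with open("example_config.yaml", "w") as f:
    yaml.safe_dump(example_config, f)

sys.argv = ["train.py", "--config", "example_config.yaml"]
args = get_args()
print(args.optimizer, args.learning_rate)
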
+ def atomic_save(save_path, accelerator):
+ parent = os.path.dirname(save_path)
+ tmp_dir = tempfile.mkdtemp(dir=parent)
+ backup_dir = save_path + "_backup"
+
+ try:
+ # Save state into the temp directory
+ accelerator.save_state(tmp_dir)
+
+ # Backup existing save_path if it exists
+ if os.path.exists(save_path):
+ os.rename(save_path, backup_dir)
+
+ # Atomically move temp directory into place
+ os.rename(tmp_dir, save_path)
+
+ # Clean up the backup directory
+ if os.path.exists(backup_dir):
+ shutil.rmtree(backup_dir)
+
+ except Exception as e:
+ # Clean up temp directory on failure
+ if os.path.exists(tmp_dir):
+ shutil.rmtree(tmp_dir)
+
+ # Restore from backup if replacement failed
+ if os.path.exists(backup_dir):
+ if os.path.exists(save_path):
+ shutil.rmtree(save_path)
+ os.rename(backup_dir, save_path)
+
+ raise e
+
+
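
atomic_save stages the checkpoint in a temporary directory created next to the target, renames it into place, and restores the previous checkpoint from a backup if anything fails, so an interrupted job never sees a half-written state directory. A minimal usage sketch; the path and toy model are assumptions, not part of the commit.

# Sketch (assumed path): save Accelerate state atomically.
import os
import torch
from accelerate import Accelerator

accelerator = Accelerator()
model = accelerator.prepare(torch.nn.Linear(8, 8))

os.makedirs("checkpoints", exist_ok=True)   # parent must exist for tempfile.mkdtemp
atomic_save(os.path.join("checkpoints", "checkpoint-100"), accelerator)
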
+ def get_optimizer(args, params_to_optimize, use_deepspeed: bool = False):
+ # Use DeepSpeed optimizer
+ if use_deepspeed:
+ from accelerate.utils import DummyOptim
+
+
+ return DummyOptim(
+ params_to_optimize,
+ lr=args.learning_rate,
+ betas=(args.adam_beta1, args.adam_beta2),
+ eps=args.adam_epsilon,
+ weight_decay=args.adam_weight_decay,
+ )
+
+ # Optimizer creation
+ supported_optimizers = ["adam", "adamw", "prodigy"]
+ if args.optimizer not in supported_optimizers:
+ logger.warning(
+ f"Unsupported choice of optimizer: {args.optimizer}. Supported optimizers include {supported_optimizers}. Defaulting to AdamW"
+ )
+ args.optimizer = "adamw"
+
+ if args.use_8bit_adam and args.optimizer.lower() not in ["adam", "adamw"]:
+ logger.warning(
+ f"use_8bit_adam is ignored when optimizer is not set to 'Adam' or 'AdamW'. Optimizer was "
+ f"set to {args.optimizer.lower()}"
+ )
+
+ if args.use_8bit_adam:
+ try:
+ import bitsandbytes as bnb
+ except ImportError:
+ raise ImportError(
+ "To use 8-bit Adam, please install the bitsandbytes library: `pip install bitsandbytes`."
+ )
+
+ if args.optimizer.lower() == "adamw":
+ optimizer_class = bnb.optim.AdamW8bit if args.use_8bit_adam else torch.optim.AdamW
+
+ optimizer = optimizer_class(
+ params_to_optimize,
+ betas=(args.adam_beta1, args.adam_beta2),
+ eps=args.adam_epsilon,
+ weight_decay=args.adam_weight_decay,
+ )
+ elif args.optimizer.lower() == "adam":
+ optimizer_class = bnb.optim.Adam8bit if args.use_8bit_adam else torch.optim.Adam
+
+
+ optimizer = optimizer_class(
+ params_to_optimize,
+ betas=(args.adam_beta1, args.adam_beta2),
+ eps=args.adam_epsilon,
+ weight_decay=args.adam_weight_decay,
+ )
+ elif args.optimizer.lower() == "prodigy":
+ try:
+ import prodigyopt
+ except ImportError:
+ raise ImportError("To use Prodigy, please install the prodigyopt library: `pip install prodigyopt`")
+
+ optimizer_class = prodigyopt.Prodigy
+
+ if args.learning_rate <= 0.1:
+ logger.warning(
+ "Learning rate is too low. When using prodigy, it's generally better to set learning rate around 1.0"
+ )
+
+ optimizer = optimizer_class(
+ params_to_optimize,
+ lr=args.learning_rate,
+ betas=(args.adam_beta1, args.adam_beta2),
+ beta3=args.prodigy_beta3,
+ weight_decay=args.adam_weight_decay,
+ eps=args.adam_epsilon,
+ decouple=args.prodigy_decouple,
+ use_bias_correction=args.prodigy_use_bias_correction,
+ safeguard_warmup=args.prodigy_safeguard_warmup,
+ )
+
+ return optimizer
+
+
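
Because get_optimizer pulls every hyperparameter off args, an argparse.Namespace carrying the Adam fields above is enough to drive it; the learning rate travels in the parameter group, matching how the AdamW branch omits lr in the constructor call. A hypothetical sketch with assumed values:

# Sketch (assumed values): build a plain AdamW optimizer for a toy module.
import argparse
import torch

toy_model = torch.nn.Linear(16, 16)
opt_args = argparse.Namespace(
    optimizer="adamw",
    use_8bit_adam=False,
    learning_rate=1e-4,
    adam_beta1=0.9,
    adam_beta2=0.95,
    adam_epsilon=1e-8,
    adam_weight_decay=1e-4,
)
# lr is supplied through the param group rather than the AdamW constructor.
params_to_optimize = [{"params": list(toy_model.parameters()), "lr": opt_args.learning_rate}]
optimizer = get_optimizer(opt_args, params_to_optimize)
print(type(optimizer).__name__)  # AdamW
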
+ def prepare_rotary_positional_embeddings(
+ height: int,
+ width: int,
+ num_frames: int,
+ vae_scale_factor_spatial: int = 8,
+ patch_size: int = 2,
+ attention_head_dim: int = 64,
+ device: Optional[torch.device] = None,
+ base_height: int = 480,
+ base_width: int = 720,
+ ) -> Tuple[torch.Tensor, torch.Tensor]:
+ grid_height = height // (vae_scale_factor_spatial * patch_size)
+ grid_width = width // (vae_scale_factor_spatial * patch_size)
+ base_size_width = base_width // (vae_scale_factor_spatial * patch_size)
+ base_size_height = base_height // (vae_scale_factor_spatial * patch_size)
+
+ grid_crops_coords = get_resize_crop_region_for_grid((grid_height, grid_width), base_size_width, base_size_height)
+ freqs_cos, freqs_sin = get_3d_rotary_pos_embed(
+ embed_dim=attention_head_dim,
+ crops_coords=grid_crops_coords,
+ grid_size=(grid_height, grid_width),
+ temporal_size=num_frames,
+ )
+
+ freqs_cos = freqs_cos.to(device=device)
+ freqs_sin = freqs_sin.to(device=device)
+ return freqs_cos, freqs_sin
+
+
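
With the defaults above (spatial VAE factor 8, patch size 2), a 480x720 clip maps to a 30x45 token grid per latent frame, and num_frames is the temporal size of the latent video rather than the pixel frame count. A hypothetical call with assumed sizes:

# Sketch (assumed sizes): RoPE frequencies for a 480x720 clip with 13 latent frames.
import torch

freqs_cos, freqs_sin = prepare_rotary_positional_embeddings(
    height=480,
    width=720,
    num_frames=13,   # latent frames (e.g. 13 for a 49-frame clip with 4x temporal compression)
    attention_head_dim=64,
    device=torch.device("cpu"),
)
print(freqs_cos.shape, freqs_sin.shape)  # one row per latent token (13 * 30 * 45 here)
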
+ def _get_t5_prompt_embeds(
+ tokenizer: T5Tokenizer,
+ text_encoder: T5EncoderModel,
+ prompt: Union[str, List[str]],
+ num_videos_per_prompt: int = 1,
+ max_sequence_length: int = 226,
+ device: Optional[torch.device] = None,
+ dtype: Optional[torch.dtype] = None,
+ text_input_ids=None,
+ ):
+ prompt = [prompt] if isinstance(prompt, str) else prompt
+ batch_size = len(prompt)
+
+ if tokenizer is not None:
+ text_inputs = tokenizer(
+ prompt,
+ padding="max_length",
+ max_length=max_sequence_length,
+ truncation=True,
+ add_special_tokens=True,
+ return_tensors="pt",
+ )
+ text_input_ids = text_inputs.input_ids
+ else:
+ if text_input_ids is None:
+ raise ValueError("`text_input_ids` must be provided when the tokenizer is not specified.")
+
+ prompt_embeds = text_encoder(text_input_ids.to(device))[0]
+ prompt_embeds = prompt_embeds.to(dtype=dtype, device=device)
+
+ # duplicate text embeddings for each generation per prompt, using mps friendly method
+ _, seq_len, _ = prompt_embeds.shape
+ prompt_embeds = prompt_embeds.repeat(1, num_videos_per_prompt, 1)
+ prompt_embeds = prompt_embeds.view(batch_size * num_videos_per_prompt, seq_len, -1)
+
+ return prompt_embeds
+
+
+ def encode_prompt(
+ tokenizer: T5Tokenizer,
+ text_encoder: T5EncoderModel,
+ prompt: Union[str, List[str]],
+ num_videos_per_prompt: int = 1,
+ max_sequence_length: int = 226,
+ device: Optional[torch.device] = None,
+ dtype: Optional[torch.dtype] = None,
+ text_input_ids=None,
+ ):
+ prompt = [prompt] if isinstance(prompt, str) else prompt
+ prompt_embeds = _get_t5_prompt_embeds(
+ tokenizer,
+ text_encoder,
+ prompt=prompt,
+ num_videos_per_prompt=num_videos_per_prompt,
+ max_sequence_length=max_sequence_length,
+ device=device,
+ dtype=dtype,
+ text_input_ids=text_input_ids,
+ )
+ return prompt_embeds
+
+
+ def compute_prompt_embeddings(
+ tokenizer, text_encoder, prompt, max_sequence_length, device, dtype, requires_grad: bool = False
+ ):
+ if requires_grad:
+ prompt_embeds = encode_prompt(
+ tokenizer,
+ text_encoder,
+ prompt,
+ num_videos_per_prompt=1,
+ max_sequence_length=max_sequence_length,
+ device=device,
+ dtype=dtype,
+ )
+ else:
+ with torch.no_grad():
+ prompt_embeds = encode_prompt(
+ tokenizer,
+ text_encoder,
+ prompt,
+ num_videos_per_prompt=1,
+ max_sequence_length=max_sequence_length,
+ device=device,
+ dtype=dtype,
+ )
+ return prompt_embeds
+
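
compute_prompt_embeddings is a thin wrapper that runs encode_prompt with or without gradients, so it only needs the T5 tokenizer/encoder pair and a prompt list. A hypothetical sketch; "THUDM/CogVideoX-2b" is an assumed checkpoint id, and any diffusers-format CogVideoX repo with tokenizer and text_encoder subfolders should behave the same way.

# Sketch (assumed checkpoint): embed the empty prompt the way the training loop does.
import torch
from transformers import T5EncoderModel, T5Tokenizer

tokenizer = T5Tokenizer.from_pretrained("THUDM/CogVideoX-2b", subfolder="tokenizer")
text_encoder = T5EncoderModel.from_pretrained("THUDM/CogVideoX-2b", subfolder="text_encoder")

prompt_embeds = compute_prompt_embeddings(
    tokenizer,
    text_encoder,
    prompt=[""],
    max_sequence_length=226,
    device=torch.device("cpu"),
    dtype=torch.float32,
    requires_grad=False,
)
print(prompt_embeds.shape)  # (1, 226, hidden_dim) for a single prompt
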
+ def save_frames_as_pngs(video_array, output_dir,
+ downsample_spatial=1, # e.g. 2 to halve width & height
+ downsample_temporal=1): # e.g. 2 to keep every 2nd frame
+ """
+ Save each frame of a (T, H, W, C) uint8 RGB array as a PNG with no compression.
+ """
+ assert video_array.ndim == 4 and video_array.shape[-1] == 3, \
+ "Expected (T, H, W, C=3) array"
+ assert video_array.dtype == np.uint8, "Expected uint8 array"
+
+ os.makedirs(output_dir, exist_ok=True)
+
+ # temporal downsample
+ frames = video_array[::downsample_temporal]
+
+ # compute spatially downsampled size
+ T, H, W, _ = frames.shape
+ new_size = (W // downsample_spatial, H // downsample_spatial)
+
+ # PNG compression param: 0 = no compression
+ png_params = [cv2.IMWRITE_PNG_COMPRESSION, 0]
+
+ for idx, frame in enumerate(frames):
+ # frame is RGB; convert to BGR for OpenCV
+ bgr = frame[..., ::-1]
+ if downsample_spatial > 1:
+ bgr = cv2.resize(bgr, new_size, interpolation=cv2.INTER_NEAREST)
+
+ filename = os.path.join(output_dir, "frame_{:05d}.png".format(idx))
+ success = cv2.imwrite(filename, bgr, png_params)
+ if not success:
+ raise RuntimeError(f"Failed to write frame {idx} to {filename}")
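
A quick self-contained check of save_frames_as_pngs on synthetic data; the output directory name is arbitrary.

# Sketch: dump 8 random RGB frames, keeping every 2nd frame at half resolution.
import numpy as np

dummy_video = np.random.randint(0, 256, size=(8, 192, 160, 3), dtype=np.uint8)
save_frames_as_pngs(dummy_video, "debug_frames", downsample_spatial=2, downsample_temporal=2)
# -> debug_frames/frame_00000.png ... frame_00003.png, each 80x96 pixels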