RaphaelLiu committed
Commit 759dfe0 · verified · 1 Parent(s): ce6b04d

Upload folder using huggingface_hub

Browse files
This view is limited to 50 files because it contains too many changes. See raw diff
Files changed (50)
  1. .gitattributes +3 -0
  2. LICENSE +201 -0
  3. README.md +78 -3
  4. assets/grid.mp4 +3 -0
  5. assets/mochi-factory.webp +3 -0
  6. contrib/README.md +6 -0
  7. contrib/modal/lora.yaml +58 -0
  8. contrib/modal/main.py +285 -0
  9. contrib/modal/readme.md +55 -0
  10. decoder.safetensors +3 -0
  11. demos/api_example.py +53 -0
  12. demos/cli.py +163 -0
  13. demos/comfyui_nodes.py +0 -0
  14. demos/fine_tuner/README.md +99 -0
  15. demos/fine_tuner/configs/lora.yaml +58 -0
  16. demos/fine_tuner/dataset.py +45 -0
  17. demos/fine_tuner/embed_captions.py +66 -0
  18. demos/fine_tuner/encode_videos.py +142 -0
  19. demos/fine_tuner/preprocess.bash +64 -0
  20. demos/fine_tuner/run.bash +92 -0
  21. demos/fine_tuner/train.py +396 -0
  22. demos/fine_tuner/trim_and_crop_videos.py +110 -0
  23. demos/gradio_ui.py +57 -0
  24. demos/test_encoder_decoder.py +79 -0
  25. encoder.safetensors +3 -0
  26. model_index.json +24 -0
  27. pusa_v0_dit.safetensors +3 -0
  28. pyproject.toml +37 -0
  29. pyrightconfig.json +4 -0
  30. requirements.txt +14 -0
  31. scheduler/scheduler_config.json +12 -0
  32. scripts/download_weights.py +41 -0
  33. scripts/format.bash +5 -0
  34. scripts/pytorch_to_safe_tensors.py +24 -0
  35. scripts/typecheck.bash +2 -0
  36. scripts/weights_to_fp8.py +0 -0
  37. src/genmo/lib/attn_imports.py +29 -0
  38. src/genmo/lib/progress.py +87 -0
  39. src/genmo/lib/utils.py +67 -0
  40. src/genmo/mochi_preview/__init__.py +0 -0
  41. src/genmo/mochi_preview/dit/joint_model/__init__.py +0 -0
  42. src/genmo/mochi_preview/dit/joint_model/asymm_models_joint.py +737 -0
  43. src/genmo/mochi_preview/dit/joint_model/context_parallel.py +158 -0
  44. src/genmo/mochi_preview/dit/joint_model/layers.py +179 -0
  45. src/genmo/mochi_preview/dit/joint_model/lora.py +112 -0
  46. src/genmo/mochi_preview/dit/joint_model/mod_rmsnorm.py +15 -0
  47. src/genmo/mochi_preview/dit/joint_model/residual_tanh_gated_rmsnorm.py +20 -0
  48. src/genmo/mochi_preview/dit/joint_model/rope_mixed.py +88 -0
  49. src/genmo/mochi_preview/dit/joint_model/temporal_rope.py +34 -0
  50. src/genmo/mochi_preview/dit/joint_model/utils.py +109 -0
.gitattributes CHANGED
@@ -33,3 +33,6 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
33
  *.zip filter=lfs diff=lfs merge=lfs -text
34
  *.zst filter=lfs diff=lfs merge=lfs -text
35
  *tfevents* filter=lfs diff=lfs merge=lfs -text
36
+ assets/grid.gif filter=lfs diff=lfs merge=lfs -text
37
+ assets/grid.mp4 filter=lfs diff=lfs merge=lfs -text
38
+ assets/mochi-factory.webp filter=lfs diff=lfs merge=lfs -text
LICENSE ADDED
@@ -0,0 +1,201 @@
1
+ Apache License
2
+ Version 2.0, January 2004
3
+ http://www.apache.org/licenses/
4
+
5
+ TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION
6
+
7
+ 1. Definitions.
8
+
9
+ "License" shall mean the terms and conditions for use, reproduction,
10
+ and distribution as defined by Sections 1 through 9 of this document.
11
+
12
+ "Licensor" shall mean the copyright owner or entity authorized by
13
+ the copyright owner that is granting the License.
14
+
15
+ "Legal Entity" shall mean the union of the acting entity and all
16
+ other entities that control, are controlled by, or are under common
17
+ control with that entity. For the purposes of this definition,
18
+ "control" means (i) the power, direct or indirect, to cause the
19
+ direction or management of such entity, whether by contract or
20
+ otherwise, or (ii) ownership of fifty percent (50%) or more of the
21
+ outstanding shares, or (iii) beneficial ownership of such entity.
22
+
23
+ "You" (or "Your") shall mean an individual or Legal Entity
24
+ exercising permissions granted by this License.
25
+
26
+ "Source" form shall mean the preferred form for making modifications,
27
+ including but not limited to software source code, documentation
28
+ source, and configuration files.
29
+
30
+ "Object" form shall mean any form resulting from mechanical
31
+ transformation or translation of a Source form, including but
32
+ not limited to compiled object code, generated documentation,
33
+ and conversions to other media types.
34
+
35
+ "Work" shall mean the work of authorship, whether in Source or
36
+ Object form, made available under the License, as indicated by a
37
+ copyright notice that is included in or attached to the work
38
+ (an example is provided in the Appendix below).
39
+
40
+ "Derivative Works" shall mean any work, whether in Source or Object
41
+ form, that is based on (or derived from) the Work and for which the
42
+ editorial revisions, annotations, elaborations, or other modifications
43
+ represent, as a whole, an original work of authorship. For the purposes
44
+ of this License, Derivative Works shall not include works that remain
45
+ separable from, or merely link (or bind by name) to the interfaces of,
46
+ the Work and Derivative Works thereof.
47
+
48
+ "Contribution" shall mean any work of authorship, including
49
+ the original version of the Work and any modifications or additions
50
+ to that Work or Derivative Works thereof, that is intentionally
51
+ submitted to Licensor for inclusion in the Work by the copyright owner
52
+ or by an individual or Legal Entity authorized to submit on behalf of
53
+ the copyright owner. For the purposes of this definition, "submitted"
54
+ means any form of electronic, verbal, or written communication sent
55
+ to the Licensor or its representatives, including but not limited to
56
+ communication on electronic mailing lists, source code control systems,
57
+ and issue tracking systems that are managed by, or on behalf of, the
58
+ Licensor for the purpose of discussing and improving the Work, but
59
+ excluding communication that is conspicuously marked or otherwise
60
+ designated in writing by the copyright owner as "Not a Contribution."
61
+
62
+ "Contributor" shall mean Licensor and any individual or Legal Entity
63
+ on behalf of whom a Contribution has been received by Licensor and
64
+ subsequently incorporated within the Work.
65
+
66
+ 2. Grant of Copyright License. Subject to the terms and conditions of
67
+ this License, each Contributor hereby grants to You a perpetual,
68
+ worldwide, non-exclusive, no-charge, royalty-free, irrevocable
69
+ copyright license to reproduce, prepare Derivative Works of,
70
+ publicly display, publicly perform, sublicense, and distribute the
71
+ Work and such Derivative Works in Source or Object form.
72
+
73
+ 3. Grant of Patent License. Subject to the terms and conditions of
74
+ this License, each Contributor hereby grants to You a perpetual,
75
+ worldwide, non-exclusive, no-charge, royalty-free, irrevocable
76
+ (except as stated in this section) patent license to make, have made,
77
+ use, offer to sell, sell, import, and otherwise transfer the Work,
78
+ where such license applies only to those patent claims licensable
79
+ by such Contributor that are necessarily infringed by their
80
+ Contribution(s) alone or by combination of their Contribution(s)
81
+ with the Work to which such Contribution(s) was submitted. If You
82
+ institute patent litigation against any entity (including a
83
+ cross-claim or counterclaim in a lawsuit) alleging that the Work
84
+ or a Contribution incorporated within the Work constitutes direct
85
+ or contributory patent infringement, then any patent licenses
86
+ granted to You under this License for that Work shall terminate
87
+ as of the date such litigation is filed.
88
+
89
+ 4. Redistribution. You may reproduce and distribute copies of the
90
+ Work or Derivative Works thereof in any medium, with or without
91
+ modifications, and in Source or Object form, provided that You
92
+ meet the following conditions:
93
+
94
+ (a) You must give any other recipients of the Work or
95
+ Derivative Works a copy of this License; and
96
+
97
+ (b) You must cause any modified files to carry prominent notices
98
+ stating that You changed the files; and
99
+
100
+ (c) You must retain, in the Source form of any Derivative Works
101
+ that You distribute, all copyright, patent, trademark, and
102
+ attribution notices from the Source form of the Work,
103
+ excluding those notices that do not pertain to any part of
104
+ the Derivative Works; and
105
+
106
+ (d) If the Work includes a "NOTICE" text file as part of its
107
+ distribution, then any Derivative Works that You distribute must
108
+ include a readable copy of the attribution notices contained
109
+ within such NOTICE file, excluding those notices that do not
110
+ pertain to any part of the Derivative Works, in at least one
111
+ of the following places: within a NOTICE text file distributed
112
+ as part of the Derivative Works; within the Source form or
113
+ documentation, if provided along with the Derivative Works; or,
114
+ within a display generated by the Derivative Works, if and
115
+ wherever such third-party notices normally appear. The contents
116
+ of the NOTICE file are for informational purposes only and
117
+ do not modify the License. You may add Your own attribution
118
+ notices within Derivative Works that You distribute, alongside
119
+ or as an addendum to the NOTICE text from the Work, provided
120
+ that such additional attribution notices cannot be construed
121
+ as modifying the License.
122
+
123
+ You may add Your own copyright statement to Your modifications and
124
+ may provide additional or different license terms and conditions
125
+ for use, reproduction, or distribution of Your modifications, or
126
+ for any such Derivative Works as a whole, provided Your use,
127
+ reproduction, and distribution of the Work otherwise complies with
128
+ the conditions stated in this License.
129
+
130
+ 5. Submission of Contributions. Unless You explicitly state otherwise,
131
+ any Contribution intentionally submitted for inclusion in the Work
132
+ by You to the Licensor shall be under the terms and conditions of
133
+ this License, without any additional terms or conditions.
134
+ Notwithstanding the above, nothing herein shall supersede or modify
135
+ the terms of any separate license agreement you may have executed
136
+ with Licensor regarding such Contributions.
137
+
138
+ 6. Trademarks. This License does not grant permission to use the trade
139
+ names, trademarks, service marks, or product names of the Licensor,
140
+ except as required for reasonable and customary use in describing the
141
+ origin of the Work and reproducing the content of the NOTICE file.
142
+
143
+ 7. Disclaimer of Warranty. Unless required by applicable law or
144
+ agreed to in writing, Licensor provides the Work (and each
145
+ Contributor provides its Contributions) on an "AS IS" BASIS,
146
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
147
+ implied, including, without limitation, any warranties or conditions
148
+ of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A
149
+ PARTICULAR PURPOSE. You are solely responsible for determining the
150
+ appropriateness of using or redistributing the Work and assume any
151
+ risks associated with Your exercise of permissions under this License.
152
+
153
+ 8. Limitation of Liability. In no event and under no legal theory,
154
+ whether in tort (including negligence), contract, or otherwise,
155
+ unless required by applicable law (such as deliberate and grossly
156
+ negligent acts) or agreed to in writing, shall any Contributor be
157
+ liable to You for damages, including any direct, indirect, special,
158
+ incidental, or consequential damages of any character arising as a
159
+ result of this License or out of the use or inability to use the
160
+ Work (including but not limited to damages for loss of goodwill,
161
+ work stoppage, computer failure or malfunction, or any and all
162
+ other commercial damages or losses), even if such Contributor
163
+ has been advised of the possibility of such damages.
164
+
165
+ 9. Accepting Warranty or Additional Liability. While redistributing
166
+ the Work or Derivative Works thereof, You may choose to offer,
167
+ and charge a fee for, acceptance of support, warranty, indemnity,
168
+ or other liability obligations and/or rights consistent with this
169
+ License. However, in accepting such obligations, You may act only
170
+ on Your own behalf and on Your sole responsibility, not on behalf
171
+ of any other Contributor, and only if You agree to indemnify,
172
+ defend, and hold each Contributor harmless for any liability
173
+ incurred by, or claims asserted against, such Contributor by reason
174
+ of your accepting any such warranty or additional liability.
175
+
176
+ END OF TERMS AND CONDITIONS
177
+
178
+ APPENDIX: How to apply the Apache License to your work.
179
+
180
+ To apply the Apache License to your work, attach the following
181
+ boilerplate notice, with the fields enclosed by brackets "[]"
182
+ replaced with your own identifying information. (Don't include
183
+ the brackets!) The text should be enclosed in the appropriate
184
+ comment syntax for the file format. We also recommend that a
185
+ file or class name and description of purpose be included on the
186
+ same "printed page" as the copyright notice for easier
187
+ identification within third-party archives.
188
+
189
+ Copyright 2024 Genmo
190
+
191
+ Licensed under the Apache License, Version 2.0 (the "License");
192
+ you may not use this file except in compliance with the License.
193
+ You may obtain a copy of the License at
194
+
195
+ http://www.apache.org/licenses/LICENSE-2.0
196
+
197
+ Unless required by applicable law or agreed to in writing, software
198
+ distributed under the License is distributed on an "AS IS" BASIS,
199
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
200
+ See the License for the specific language governing permissions and
201
+ limitations under the License.
README.md CHANGED
@@ -1,3 +1,78 @@
1
- ---
2
- license: apache-2.0
3
- ---
1
+
2
+ # Pusa VidGen
3
+
4
+ [Codes](https://github.com/Yaofang-Liu/Pusa-VidGen) | [Hugging Face](https://huggingface.co/RaphaelLiu/Pusa-V0.5)
5
+
6
+ ## Overview
7
+
8
+ Pusa is an advanced open-source video generation model that builds upon Mochi 1 with significant enhancements. It supports multiple video generation tasks while maintaining high-fidelity motion and strong prompt adherence. The model is released under a permissive Apache 2.0 license.
9
+
10
+ ✨ **Key Features**
11
+ - **Multi-task support**: Text-to-Video, Image-to-Video, Interpolation, Transition, Loop, Long Video, and more
12
+ - **Cost-efficient**: Trained with just 100 H100 GPU hours
13
+ - **Full Open-Source**: Code, architecture, and training details included
14
+
15
+ 🔍 **Unique Architecture**
16
+ - A novel diffusion model supporting frame-level noise with vectorized timesteps, an approach originally introduced in the [FVDM paper](https://arxiv.org/abs/2410.03160), for greater flexibility and scalability
17
+
18
+ ## Installation
19
+
20
+ Install using [uv](https://github.com/astral-sh/uv):
21
+
22
+ ```bash
23
+ git clone https://github.com/Yaofang-Liu/Pusa-VidGen
24
+ cd Pusa-VidGen
25
+ pip install uv
26
+ uv venv .venv
27
+ source .venv/bin/activate
28
+ uv pip install setuptools
29
+ uv pip install -e . --no-build-isolation
30
+ ```
31
+
32
+
33
+ If you want to install Flash Attention, you can use:
34
+ ```bash
35
+ uv pip install -e .[flash] --no-build-isolation
36
+ ```
37
+
38
+ You will also need to install [FFmpeg](https://www.ffmpeg.org/) to turn your outputs into videos.
39
+
40
+ ## Download Weights
41
+
42
+ You can use the Hugging Face CLI to download the model:
43
+ ```bash
44
+ pip install huggingface_hub
45
+ huggingface-cli download RaphaelLiu/Pusa-V0.5 --local-dir <path_to_downloaded_directory>
46
+
47
+ ```
48
+ Or, directly download the weights from [Hugging Face](https://huggingface.co/RaphaelLiu/Pusa-V0.5) to a folder on your computer.
49
+
50
+
51
+ ## Limitations
52
+ Pusa has a few known limitations. The base model, Mochi, generates videos at 480p. We expect better results when applying our proposed method to more powerful models such as Wan2.1. We also welcome collaboration from the community to improve the model and extend its capabilities.
53
+
54
+ ## Related Work
55
+ - [Mochi](https://huggingface.co/genmo/mochi-1-preview) is our base model and one of the top three open-source video generation models on the Artificial Analysis leaderboard.
56
+ - [FVDM](https://arxiv.org/abs/2410.03160) introduces the vectorized timestep approach that inspired Pusa's frame-level noise control.
57
+
58
+ ## BibTeX
59
+ ```
60
+ @misc{Liu2025pusa,
61
+ title={Pusa: A Next-Level All-in-One Video Diffusion Model},
62
+ author={Yaofang Liu and Rui Liu},
63
+ year={2025},
64
+ publisher = {GitHub},
65
+ journal = {GitHub repository},
66
+ howpublished={\url{https://github.com/Yaofang-Liu/Pusa-VidGen}}
67
+ }
68
+ ```
69
+
70
+ ```
71
+ @article{liu2024redefining,
72
+ title={Redefining Temporal Modeling in Video Diffusion: The Vectorized Timestep Approach},
73
+ author={Liu, Yaofang and Ren, Yumeng and Cun, Xiaodong and Artola, Aitor and Liu, Yang and Zeng, Tieyong and Chan, Raymond H and Morel, Jean-michel},
74
+ journal={arXiv preprint arXiv:2410.03160},
75
+ year={2024}
76
+ }
77
+ ```
78
+
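
The "Download Weights" section above uses `huggingface-cli`. As a minimal sketch, the same download can also be done from Python with `huggingface_hub.snapshot_download`; the `local_dir` value below is just an example path, not one mandated by the repo.

```python
# Minimal sketch: programmatic equivalent of the huggingface-cli command above.
from huggingface_hub import snapshot_download

# Downloads the full RaphaelLiu/Pusa-V0.5 repository (DiT, encoder, and decoder
# weights plus configs) into ./Pusa-V0.5.
local_dir = snapshot_download(repo_id="RaphaelLiu/Pusa-V0.5", local_dir="Pusa-V0.5")
print(f"Weights downloaded to {local_dir}")
```
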
assets/grid.mp4 ADDED
@@ -0,0 +1,3 @@
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:e10304cf30b68f92b82438a33d4dd1dade8169e142fee538ceebb9d91565604d
3
+ size 6905424
assets/mochi-factory.webp ADDED

Git LFS Details

  • SHA256: dd70d39a9a26d7e69c9264caa947da0ae5c3695c384529ce469ecd1703abd165
  • Pointer size: 131 Bytes
  • Size of remote file: 560 kB
contrib/README.md ADDED
@@ -0,0 +1,6 @@
1
+ # Mochi Community Contributions
2
+
3
+ `mochi/contrib` contains community-contributed pipelines for running and customizing Mochi.
4
+
5
+ ## Index:
6
+ - `mochi/contrib/modal` - [Script](contrib/modal/readme.md) for fine-tuning Mochi on Modal GPUs.
contrib/modal/lora.yaml ADDED
@@ -0,0 +1,58 @@
1
+ init_checkpoint_path: /weights/dit.safetensors
2
+ checkpoint_dir: /finetunes/my_mochi_lora
3
+ train_data_dir: /videos_prepared
4
+ attention_mode: sdpa
5
+ single_video_mode: false # Useful for debugging whether your model can learn a single video
6
+
7
+ # You only need this if you're using wandb
8
+ wandb:
9
+ # project: mochi_1_lora
10
+ # name: ${checkpoint_dir}
11
+ # group: null
12
+
13
+ optimizer:
14
+ lr: 2e-4
15
+ weight_decay: 0.01
16
+
17
+ model:
18
+ type: lora
19
+ kwargs:
20
+ # Apply LoRA to the QKV projection and the output projection of the attention block.
21
+ qkv_proj_lora_rank: 16
22
+ qkv_proj_lora_alpha: 16
23
+ qkv_proj_lora_dropout: 0.
24
+ out_proj_lora_rank: 16
25
+ out_proj_lora_alpha: 16
26
+ out_proj_lora_dropout: 0.
27
+
28
+ training:
29
+ model_dtype: bf16
30
+ warmup_steps: 200
31
+ num_qkv_checkpoint: 48
32
+ num_ff_checkpoint: 48
33
+ num_post_attn_checkpoint: 48
34
+ num_steps: 2000
35
+ save_interval: 200
36
+ caption_dropout: 0.1
37
+ grad_clip: 0.0
38
+ save_safetensors: true
39
+
40
+ # Used for generating samples during training to monitor progress ...
41
+ sample:
42
+ interval: 200
43
+ output_dir: ${checkpoint_dir}/samples
44
+ decoder_path: /weights/decoder.safetensors
45
+ prompts:
46
+ - A pristine snowglobe featuring a winter scene sits peacefully. The glass begins to crumble into fine powder, as the entire sphere deteriorates into sparkling dust that drifts outward. The fake snow mingles with the crystalline particles, creating a glittering cloud captured in high-speed photography.
47
+ - A vintage pocket watch ticks quietly on an antique desk. Its brass casing starts to deteriorate, turning to fine metallic powder that lifts into the air. The gears and springs fragment into microscopic particles, each piece breaking down into a shimmering bronze dust that hangs suspended. The scene is richly detailed with warm, brass tones.
48
+ - A cello is propped up against a wall, a single spotlight illuminating it. The wooden surface begins to decay into fine sawdust, the instrument gradually breaking apart as its form disintegrates into a cloud of earthen particles. The strings unravel into delicate fibers that float amidst the swirling wooden dust. The scene is vibrant and colorful.
49
+ - A graphics card sits inside an oven, heatwaves around it. The silicon and metal components begin to break down at a molecular level, deteriorating into a dark cloud of fine metallic and mineral dust that hangs suspended in the heated air. The scene is darkly lit, high contrast, with a focus on the suspended particles.
50
+ - A delicate porcelain teacup sits on a marble countertop. The ceramic structure begins to crumble into a fine, chalk-like powder, breaking down into countless microscopic white particles that drift upward in graceful patterns. The scene is bright and crisp with dramatic lighting illuminating the cloud of porcelain dust.
51
+ seed: 12345
52
+ kwargs:
53
+ height: 480
54
+ width: 848
55
+ num_frames: 37
56
+ num_inference_steps: 64
57
+ sigma_schedule_python_code: "linear_quadratic_schedule(64, 0.025)"
58
+ cfg_schedule_python_code: "[6.0] * 64"
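
The config above relies on `${checkpoint_dir}`-style interpolation for `sample.output_dir`, which suggests an OmegaConf-style loader. The snippet below is only a hedged illustration of how that interpolation would resolve; the trainer's actual config-loading code is not part of this diff.

```python
# Hedged sketch: resolving ${checkpoint_dir} interpolation in lora.yaml,
# assuming the file is loaded with OmegaConf (not confirmed by this diff).
from omegaconf import OmegaConf

cfg = OmegaConf.load("contrib/modal/lora.yaml")
print(cfg.checkpoint_dir)      # /finetunes/my_mochi_lora
print(cfg.sample.output_dir)   # resolves to /finetunes/my_mochi_lora/samples
print(cfg.training.num_steps)  # 2000
```
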
contrib/modal/main.py ADDED
@@ -0,0 +1,285 @@
1
+ import modal
2
+ from pathlib import Path
3
+
4
+ # Creating our Modal App
5
+ app = modal.App("mochi-finetune")
6
+
7
+ # Creating volumes for data, intermediate data, and produced weights
8
+ videos_volume = modal.Volume.from_name("mochi-tune-videos", create_if_missing=True)
9
+ videos_prepared_volume = modal.Volume.from_name("mochi-tune-videos-prepared", create_if_missing=True)
10
+ weights_volume = modal.Volume.from_name("mochi-tune-weights", create_if_missing=True)
11
+ finetunes_volume = modal.Volume.from_name("mochi-tune-finetunes", create_if_missing=True)
12
+ outputs_volume = modal.Volume.from_name("mochi-tune-outputs", create_if_missing=True)
13
+
14
+ USERNAME = "genmoai"
15
+ REPOSITORY = "mochi"
16
+ CLONE_CMD = f"git clone https://github.com/{USERNAME}/{REPOSITORY}.git"
17
+
18
+ # Building our container image
19
+ base_img = (
20
+ modal.Image.debian_slim()
21
+ .apt_install("git", "ffmpeg", "bc", "zlib1g-dev", "libjpeg-dev", "wget")
22
+ .run_commands(CLONE_CMD)
23
+ .workdir(REPOSITORY)
24
+ .pip_install("gdown", "setuptools", "wheel")
25
+ .run_commands('pip install -e . --no-build-isolation')
26
+ )
27
+
28
+ MINUTES = 60
29
+ HOURS = 60 * MINUTES
30
+
31
+ # Remote function for downloading a labeled video dataset from Google Drive
32
+ # Run it with:
33
+ # modal run main::download_videos
34
+ @app.function(image=base_img,
35
+ volumes={
36
+ "/videos": videos_volume,
37
+ }
38
+ )
39
+ def download_videos():
40
+ '''Downloads videos from google drive into our volume'''
41
+ import gdown
42
+ import zipfile
43
+
44
+ name = "dissolve"
45
+ url = "https://drive.google.com/uc?id=1ldoBppcsv5Ueoikh0zCmNviojRCrGXQN"
46
+ output = f"{name}.zip"
47
+ gdown.download(url, output, quiet=False)
48
+ with zipfile.ZipFile(output, "r") as zip_ref:
49
+ zip_ref.extractall("/videos")
50
+
51
+ # Remote function for downloading the model weights from Hugging Face
52
+ # Run it with:
53
+ # modal run main::download_weights
54
+ @app.function(image=base_img,
55
+ volumes={
56
+ "/weights": weights_volume,
57
+ },
58
+ timeout=1*HOURS,
59
+ )
60
+ def download_weights():
61
+ # HF-transfer and snapshot download tend to hang on the large model, so we download it manually with wget
62
+ import subprocess
63
+ print("🍡 Downloading weights from Hugging Face. This may take 30 minutes.")
64
+ # ~30 min
65
+ subprocess.run(["wget", "https://huggingface.co/genmo/mochi-1-preview/resolve/main/dit.safetensors", "-O", "/weights/dit.safetensors"])
66
+ # ~1 min
67
+ subprocess.run(["wget", "https://huggingface.co/genmo/mochi-1-preview/resolve/main/decoder.safetensors", "-O", "/weights/decoder.safetensors"])
68
+ # ~20 sec
69
+ subprocess.run(["wget", "https://huggingface.co/genmo/mochi-1-preview/resolve/main/encoder.safetensors", "-O", "/weights/encoder.safetensors"])
70
+
71
+ # Remote function for preprocessing the video dataset
72
+ # Run it with:
73
+ # modal run main::preprocess
74
+ @app.function(
75
+ image=base_img,
76
+ volumes={
77
+ "/videos": videos_volume,
78
+ "/videos_prepared": videos_prepared_volume,
79
+ "/weights": weights_volume,
80
+ },
81
+ timeout=30*MINUTES,
82
+ gpu="H100"
83
+ )
84
+ def preprocess():
85
+ import subprocess
86
+ print("🍡 Preprocessing videos. This may take 2-3 minutes.")
87
+ video_dir = "videos_dissolve"
88
+ subprocess.run([
89
+ "bash", "demos/fine_tuner/preprocess.bash",
90
+ "-v", f"/videos/{video_dir}/",
91
+ "-o", "/videos_prepared/",
92
+ "-w", "/weights/",
93
+ "-n", "37"
94
+ ])
95
+
96
+ # Remote function for finetuning the model using the prepared dataset
97
+ # Configure the run in lora.yaml
98
+ # Run it with:
99
+ # modal run main::finetune
100
+ @app.function(
101
+ image=base_img,
102
+ volumes={
103
+ "/videos": videos_volume,
104
+ "/videos_prepared": videos_prepared_volume,
105
+ "/weights": weights_volume,
106
+ "/finetunes": finetunes_volume,
107
+ },
108
+ mounts=[modal.Mount.from_local_file("lora.yaml", remote_path=f"{REPOSITORY}/lora.yaml")],
109
+ timeout=4*HOURS,
110
+ gpu="H100"
111
+ )
112
+ def finetune():
113
+ import subprocess
114
+ print("🍡 Finetuning Mochi. This may take 3 hours.")
115
+ print("🍡 See your mochi-tune-finetunes volume for intermediate checkpoints and samples.")
116
+ subprocess.run([
117
+ "bash", "demos/fine_tuner/run.bash",
118
+ "-c", "lora.yaml", # from our locally mounted yaml file
119
+ "-n", "1",
120
+ ])
121
+
122
+ # Remote function (Modal @cls) for running inference on one or multiple videos
123
+ # Run it with the @local_entrypoint below
124
+ @app.cls(
125
+ image = base_img,
126
+ volumes={
127
+ "/weights": weights_volume,
128
+ "/finetunes": finetunes_volume,
129
+ "/outputs": outputs_volume,
130
+ },
131
+ timeout=30*MINUTES,
132
+ gpu="H100"
133
+ )
134
+ class MochiLora():
135
+ def __init__(self, model_dir: str = "/weights", lora_path: str = None, cpu_offload: bool = False):
136
+ self.model_dir = model_dir
137
+ self.lora_path = lora_path
138
+ self.cpu_offload = cpu_offload
139
+
140
+ @modal.enter()
141
+ def start(self):
142
+ from genmo.mochi_preview.pipelines import (
143
+ DecoderModelFactory,
144
+ DitModelFactory,
145
+ MochiMultiGPUPipeline,
146
+ MochiSingleGPUPipeline,
147
+ T5ModelFactory,
148
+ )
149
+ import torch
150
+
151
+ """Initialize the model - this runs once when the container starts"""
152
+ print("🍡 Loading Mochi model.")
153
+
154
+ self.num_gpus = torch.cuda.device_count()
155
+
156
+ # Configure pipeline based on GPU count
157
+ klass = MochiSingleGPUPipeline if self.num_gpus == 1 else MochiMultiGPUPipeline
158
+
159
+ kwargs = dict(
160
+ text_encoder_factory=T5ModelFactory(),
161
+ dit_factory=DitModelFactory(
162
+ model_path=f"{self.model_dir}/dit.safetensors",
163
+ lora_path=self.lora_path,
164
+ model_dtype="bf16",
165
+ ),
166
+ decoder_factory=DecoderModelFactory(
167
+ model_path=f"{self.model_dir}/decoder.safetensors",
168
+ ),
169
+ )
170
+
171
+ if self.num_gpus > 1:
172
+ assert not self.lora_path, f"Lora not supported in multi-GPU mode"
173
+ assert not self.cpu_offload, "CPU offload not supported in multi-GPU mode"
174
+ kwargs["world_size"] = self.num_gpus
175
+ else:
176
+ kwargs["cpu_offload"] = self.cpu_offload
177
+ kwargs["decode_type"] = "tiled_spatial"
178
+ kwargs["fast_init"] = not self.lora_path
179
+ kwargs["strict_load"] = not self.lora_path
180
+ kwargs["decode_args"] = dict(overlap=8)
181
+
182
+ self.pipeline = klass(**kwargs)
183
+ print(f"🍡 Model loaded successfully with {self.num_gpus} GPUs")
184
+
185
+ @modal.method()
186
+ def generate(self,
187
+ prompt: str,
188
+ negative_prompt: str = "",
189
+ width: int = 848,
190
+ height: int = 480,
191
+ num_frames: int = 163,
192
+ seed: int = 1710977262,
193
+ cfg_scale: float = 6.0,
194
+ num_inference_steps: int = 64) -> str:
195
+ """Generate video based on the prompt and parameters"""
196
+
197
+ print("🍡 Generating video.")
198
+
199
+ import json
200
+ import os
201
+ import time
202
+
203
+ import numpy as np
204
+
205
+ from genmo.lib.progress import progress_bar
206
+ from genmo.lib.utils import save_video
207
+ from genmo.mochi_preview.pipelines import linear_quadratic_schedule
208
+
209
+
210
+ # Create sigma schedule
211
+ sigma_schedule = linear_quadratic_schedule(num_inference_steps, 0.025)
212
+ cfg_schedule = [cfg_scale] * num_inference_steps
213
+
214
+ args = {
215
+ "height": height,
216
+ "width": width,
217
+ "num_frames": num_frames,
218
+ "sigma_schedule": sigma_schedule,
219
+ "cfg_schedule": cfg_schedule,
220
+ "num_inference_steps": num_inference_steps,
221
+ "batch_cfg": False,
222
+ "prompt": prompt,
223
+ "negative_prompt": negative_prompt,
224
+ "seed": seed,
225
+ }
226
+
227
+ with progress_bar(type="tqdm"):
228
+ final_frames = self.pipeline(**args)
229
+ final_frames = final_frames[0]
230
+
231
+ assert isinstance(final_frames, np.ndarray)
232
+ assert final_frames.dtype == np.float32
233
+
234
+ # Save to mounted volume
235
+ output_dir = "/outputs" # Assuming this path exists in the mounted volume
236
+ os.makedirs(output_dir, exist_ok=True)
237
+ output_path = os.path.join(output_dir, f"output_{int(time.time())}.mp4")
238
+
239
+ save_video(final_frames, output_path)
240
+
241
+ # Save generation parameters
242
+ json_path = os.path.splitext(output_path)[0] + ".json"
243
+ json.dump(args, open(json_path, "w"), indent=4)
244
+
245
+ print(f"🍡 Video saved to {output_path}")
246
+ outputs_volume.commit()
247
+ return output_path.split("/")[-1]
248
+
249
+ # Local entrypoint for using the MochiLora class
250
+ # Select the lora_path you'd want to use from the finetunes volume
251
+ # Then run it with:
252
+ # modal run main
253
+ @app.local_entrypoint()
254
+ def main(
255
+ prompt="A pristine snowglobe featuring a winter scene sits peacefully. The glass begins to crumble into fine powder, as the entire sphere deteriorates into sparkling dust that drifts outward. The fake snow mingles with the crystalline particles, creating a glittering cloud captured in high-speed photography.",
256
+ negative_prompt="blurry, low quality",
257
+ width=848,
258
+ height=480,
259
+ num_frames=49, # (num_frames - 1) must be divisible by 6
260
+ seed=1710977262,
261
+ cfg_scale=6.0,
262
+ num_inference_steps=64,
263
+ lora_path="/finetunes/my_mochi_lora/model_2000.lora.safetensors",
264
+ cpu_offload=True,
265
+ ):
266
+ lora = MochiLora(
267
+ lora_path=lora_path, # your lora path
268
+ cpu_offload=cpu_offload,
269
+ )
270
+ output_path = lora.generate.remote(
271
+ prompt=prompt,
272
+ negative_prompt=negative_prompt,
273
+ width=width,
274
+ height=height,
275
+ num_frames=num_frames,
276
+ seed=seed,
277
+ cfg_scale=cfg_scale,
278
+ num_inference_steps=num_inference_steps,
279
+ )
280
+
281
+ local_dir = Path("/tmp/mochi")
282
+ local_dir.mkdir(exist_ok=True, parents=True)
283
+ local_path = local_dir / output_path
284
+ local_path.write_bytes(b"".join(outputs_volume.read_file(output_path)))
285
+ print(f"🍡 video saved locally at {local_path}")
contrib/modal/readme.md ADDED
@@ -0,0 +1,55 @@
1
+ ## Finetuning Mochi with LoRA on Modal
2
+
3
+ This example demonstrates how to run the Mochi finetuner on Modal GPUs.
4
+
5
+ ### Setup
6
+ Install [Modal](https://modal.com/docs/guide).
7
+ ```bash
8
+ pip install modal
9
+ modal setup
10
+ ```
11
+
12
+ ### Fetch the dataset
13
+ There is a labeled dataset for a dissolving visual effect available on Google Drive. Download it into the `mochi-tune-videos` modal volume with:
14
+ ```bash
15
+ modal run main::download_videos
16
+ ```
17
+
18
+ ### Download the model weights
19
+ Download the model weights from Hugging Face into the `mochi-tune-weights` modal volume with:
20
+ ```bash
21
+ modal run -d main::download_weights
22
+ ```
23
+ Note that this download can take more than 30 minutes. The `-d` flag allows you to exit the terminal session without losing progress.
24
+
25
+ ### Prepare the dataset
26
+ We now run the preprocessing script to prepare the dataset for finetuning:
27
+ ```bash
28
+ modal run main::preprocess
29
+ ```
30
+ This puts preprocessed training input into the `mochi-tune-videos-prepared` modal volume.
31
+
32
+ ### Finetuning
33
+ Finetune the model using the prepared dataset.
34
+
35
+ You may configure the finetune run using the `lora.yaml` file, such as number of steps, learning rate, etc.
36
+
37
+ Run the finetuning with:
38
+ ```bash
39
+ modal run -d main::finetune
40
+ ```
41
+
42
+ This will produce a series of checkpoints, as well as video samples generated during training. You can view these files in the Modal `mochi-tune-finetunes` volume using the Storage tab in the dashboard.
43
+
44
+ ### Inference
45
+ You can now use the MochiLora class to generate videos from a prompt. The `main` entrypoint will initialize the model to use the specified LoRA weights from your finetuning run.
46
+
47
+ ```bash
48
+ modal run main
49
+ ```
50
+ or with more parameters:
51
+ ```bash
52
+ modal run main --lora-path="/finetunes/my_mochi_lora/model_1000.lora.safetensors" --prompt="A pristine snowglobe featuring a winter scene sits peacefully. The glass begins to crumble into fine powder, as the entire sphere deteriorates into sparkling dust that drifts outward."
53
+ ```
54
+
55
+ See `modal run main --help` for all inference options.
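
The local entrypoint in `main.py` already pulls generated videos out of the outputs volume with `Volume.read_file`. The same pattern can be reused to copy a finetuned LoRA checkpoint to local disk; this is a sketch, and the checkpoint path below is illustrative.

```python
# Sketch: copy a LoRA checkpoint out of the mochi-tune-finetunes volume,
# mirroring the Volume.read_file pattern used in main.py's local entrypoint.
from pathlib import Path

import modal

finetunes_volume = modal.Volume.from_name("mochi-tune-finetunes")
remote_path = "my_mochi_lora/model_2000.lora.safetensors"  # illustrative path
local_path = Path("/tmp/mochi") / Path(remote_path).name
local_path.parent.mkdir(parents=True, exist_ok=True)
local_path.write_bytes(b"".join(finetunes_volume.read_file(remote_path)))
print(f"Saved {local_path}")
```
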
decoder.safetensors ADDED
@@ -0,0 +1,3 @@
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:641920faaf20e5404ddb5553ce3e295c21ed9b4bc5f6fe7c930811b84099cb14
3
+ size 1450122828
demos/api_example.py ADDED
@@ -0,0 +1,53 @@
1
+ #! /usr/bin/env python
2
+ import sys
3
+ from pathlib import Path
4
+ from textwrap import dedent
5
+
6
+ from genmo.lib.progress import progress_bar
7
+ from genmo.lib.utils import save_video
8
+ from genmo.mochi_preview.pipelines import (
9
+ DecoderModelFactory,
10
+ DitModelFactory,
11
+ MochiSingleGPUPipeline,
12
+ T5ModelFactory,
13
+ linear_quadratic_schedule,
14
+ )
15
+
16
+ MOCHI_DIR = sys.argv[1]
17
+ assert Path(MOCHI_DIR).exists(), f"Model directory {MOCHI_DIR} does not exist."
18
+ pipeline = MochiSingleGPUPipeline(
19
+ text_encoder_factory=T5ModelFactory(),
20
+ dit_factory=DitModelFactory(model_path=f"{MOCHI_DIR}/dit.safetensors", model_dtype="bf16"),
21
+ decoder_factory=DecoderModelFactory(
22
+ model_path=f"{MOCHI_DIR}/vae.safetensors",
23
+ model_stats_path=f"{MOCHI_DIR}/vae_stats.json",
24
+ ),
25
+ cpu_offload=True,
26
+ decode_type="tiled_full",
27
+ )
28
+
29
+ PROMPT = dedent("""
30
+ A hand with delicate fingers picks up a bright yellow lemon from a wooden bowl
31
+ filled with lemons and sprigs of mint against a peach-colored background.
32
+ The hand gently tosses the lemon up and catches it, showcasing its smooth texture.
33
+ A beige string bag sits beside the bowl, adding a rustic touch to the scene.
34
+ Additional lemons, one halved, are scattered around the base of the bowl.
35
+ The even lighting enhances the vibrant colors and creates a fresh,
36
+ inviting atmosphere.
37
+ """)
38
+
39
+ video = pipeline(
40
+ height=480,
41
+ width=848,
42
+ num_frames=31,
43
+ num_inference_steps=64,
44
+ sigma_schedule=linear_quadratic_schedule(64, 0.025),
45
+ cfg_schedule=[4.5] * 64,
46
+ batch_cfg=False,
47
+ prompt=PROMPT,
48
+ negative_prompt="",
49
+ seed=12345,
50
+ )
51
+
52
+ with progress_bar(type="tqdm"):
53
+ save_video(video[0], "video.mp4")
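
The example above requests `num_frames=31`. As noted in `contrib/modal/main.py` and enforced by the fine-tuner's preprocessing (`(T - 1) % 6 == 0`), frame counts must satisfy `(num_frames - 1) % 6 == 0`; a quick self-contained way to list valid values:

```python
# Quick check of the frame-count rule stated elsewhere in this repo:
# (num_frames - 1) must be divisible by 6.
valid_frame_counts = [n for n in range(25, 164) if (n - 1) % 6 == 0]
print(valid_frame_counts[:8])                               # [25, 31, 37, 43, 49, 55, 61, 67]
print(31 in valid_frame_counts, 163 in valid_frame_counts)  # True True
```
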
demos/cli.py ADDED
@@ -0,0 +1,163 @@
1
+ #! /usr/bin/env python
2
+ import json
3
+ import os
4
+ import time
5
+
6
+ import click
7
+ import numpy as np
8
+ import torch
9
+
10
+ from genmo.lib.progress import progress_bar
11
+ from genmo.lib.utils import save_video
12
+ from genmo.mochi_preview.pipelines import (
13
+ DecoderModelFactory,
14
+ DitModelFactory,
15
+ MochiMultiGPUPipeline,
16
+ MochiSingleGPUPipeline,
17
+ T5ModelFactory,
18
+ linear_quadratic_schedule,
19
+ )
20
+
21
+ pipeline = None
22
+ model_dir_path = None
23
+ lora_path = None
24
+ num_gpus = torch.cuda.device_count()
25
+ cpu_offload = False
26
+
27
+
28
+ def configure_model(model_dir_path_, lora_path_, cpu_offload_):
29
+ global model_dir_path, lora_path, cpu_offload
30
+ model_dir_path = model_dir_path_
31
+ lora_path = lora_path_
32
+ cpu_offload = cpu_offload_
33
+
34
+
35
+ def load_model():
36
+ global num_gpus, pipeline, model_dir_path, lora_path
37
+ if pipeline is None:
38
+ MOCHI_DIR = model_dir_path
39
+ print(f"Launching with {num_gpus} GPUs. If you want to force single GPU mode use CUDA_VISIBLE_DEVICES=0.")
40
+ klass = MochiSingleGPUPipeline if num_gpus == 1 else MochiMultiGPUPipeline
41
+ kwargs = dict(
42
+ text_encoder_factory=T5ModelFactory(),
43
+ dit_factory=DitModelFactory(
44
+ model_path=f"{MOCHI_DIR}/dit.safetensors",
45
+ lora_path=lora_path,
46
+ model_dtype="bf16",
47
+ ),
48
+ decoder_factory=DecoderModelFactory(
49
+ model_path=f"{MOCHI_DIR}/decoder.safetensors",
50
+ ),
51
+ )
52
+ if num_gpus > 1:
53
+ assert not lora_path, f"Lora not supported in multi-GPU mode"
54
+ assert not cpu_offload, "CPU offload not supported in multi-GPU mode"
55
+ kwargs["world_size"] = num_gpus
56
+ else:
57
+ kwargs["cpu_offload"] = cpu_offload
58
+ kwargs["decode_type"] = "tiled_spatial"
59
+ kwargs["fast_init"] = not lora_path
60
+ kwargs["strict_load"] = not lora_path
61
+ kwargs["decode_args"] = dict(overlap=8)
62
+ pipeline = klass(**kwargs)
63
+
64
+
65
+ def generate_video(
66
+ prompt,
67
+ negative_prompt,
68
+ width,
69
+ height,
70
+ num_frames,
71
+ seed,
72
+ cfg_scale,
73
+ num_inference_steps,
74
+ ):
75
+ load_model()
76
+
77
+ # sigma_schedule should be a list of floats of length (num_inference_steps + 1),
78
+ # such that sigma_schedule[0] == 1.0 and sigma_schedule[-1] == 0.0 and monotonically decreasing.
79
+ sigma_schedule = linear_quadratic_schedule(num_inference_steps, 0.025)
80
+
81
+ # cfg_schedule should be a list of floats of length num_inference_steps.
82
+ # For simplicity, we just use the same cfg scale at all timesteps,
83
+ # but more optimal schedules may use varying cfg, e.g:
84
+ # [5.0] * (num_inference_steps // 2) + [4.5] * (num_inference_steps // 2)
85
+ cfg_schedule = [cfg_scale] * num_inference_steps
86
+
87
+ args = {
88
+ "height": height,
89
+ "width": width,
90
+ "num_frames": num_frames,
91
+ "sigma_schedule": sigma_schedule,
92
+ "cfg_schedule": cfg_schedule,
93
+ "num_inference_steps": num_inference_steps,
94
+ # We *need* flash attention to batch cfg
95
+ # and it's only worth doing in a high-memory regime (assume multiple GPUs)
96
+ "batch_cfg": False,
97
+ "prompt": prompt,
98
+ "negative_prompt": negative_prompt,
99
+ "seed": seed,
100
+ }
101
+
102
+ with progress_bar(type="tqdm"):
103
+ final_frames = pipeline(**args)
104
+
105
+ final_frames = final_frames[0]
106
+
107
+ assert isinstance(final_frames, np.ndarray)
108
+ assert final_frames.dtype == np.float32
109
+
110
+ os.makedirs("outputs", exist_ok=True)
111
+ output_path = os.path.join("outputs", f"output_{int(time.time())}.mp4")
112
+
113
+ save_video(final_frames, output_path)
114
+ json_path = os.path.splitext(output_path)[0] + ".json"
115
+ json.dump(args, open(json_path, "w"), indent=4)
116
+
117
+ return output_path
118
+
119
+
120
+ from textwrap import dedent
121
+
122
+ DEFAULT_PROMPT = dedent("""
123
+ A hand with delicate fingers picks up a bright yellow lemon from a wooden bowl
124
+ filled with lemons and sprigs of mint against a peach-colored background.
125
+ The hand gently tosses the lemon up and catches it, showcasing its smooth texture.
126
+ A beige string bag sits beside the bowl, adding a rustic touch to the scene.
127
+ Additional lemons, one halved, are scattered around the base of the bowl.
128
+ The even lighting enhances the vibrant colors and creates a fresh,
129
+ inviting atmosphere.
130
+ """)
131
+
132
+
133
+ @click.command()
134
+ @click.option("--prompt", default=DEFAULT_PROMPT, help="Prompt for video generation.")
135
+ @click.option("--negative_prompt", default="", help="Negative prompt for video generation.")
136
+ @click.option("--width", default=848, type=int, help="Width of the video.")
137
+ @click.option("--height", default=480, type=int, help="Height of the video.")
138
+ @click.option("--num_frames", default=163, type=int, help="Number of frames.")
139
+ @click.option("--seed", default=1710977262, type=int, help="Random seed.")
140
+ @click.option("--cfg_scale", default=6.0, type=float, help="CFG Scale.")
141
+ @click.option("--num_steps", default=64, type=int, help="Number of inference steps.")
142
+ @click.option("--model_dir", required=True, help="Path to the model directory.")
143
+ @click.option("--lora_path", required=False, help="Path to the lora file.")
144
+ @click.option("--cpu_offload", is_flag=True, help="Whether to offload model to CPU")
145
+ def generate_cli(
146
+ prompt, negative_prompt, width, height, num_frames, seed, cfg_scale, num_steps, model_dir, lora_path, cpu_offload
147
+ ):
148
+ configure_model(model_dir, lora_path, cpu_offload)
149
+ output = generate_video(
150
+ prompt,
151
+ negative_prompt,
152
+ width,
153
+ height,
154
+ num_frames,
155
+ seed,
156
+ cfg_scale,
157
+ num_steps,
158
+ )
159
+ click.echo(f"Video generated at: {output}")
160
+
161
+
162
+ if __name__ == "__main__":
163
+ generate_cli()
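
The comments inside `generate_video` above state the contract for both schedules: `sigma_schedule` has `num_inference_steps + 1` entries, starts at 1.0, ends at 0.0, and decreases monotonically, while `cfg_schedule` has `num_inference_steps` entries. The self-contained sketch below just validates that contract against a toy linear schedule; it does not reimplement the repo's `linear_quadratic_schedule`.

```python
# Self-contained sketch validating the schedule contract described in cli.py.
def check_schedules(sigma_schedule, cfg_schedule, num_inference_steps):
    assert len(sigma_schedule) == num_inference_steps + 1
    assert sigma_schedule[0] == 1.0 and sigma_schedule[-1] == 0.0
    assert all(a >= b for a, b in zip(sigma_schedule, sigma_schedule[1:]))
    assert len(cfg_schedule) == num_inference_steps

num_steps = 64
toy_sigmas = [1.0 - i / num_steps for i in range(num_steps + 1)]  # toy linear schedule
check_schedules(toy_sigmas, [6.0] * num_steps, num_steps)
print("schedules OK")
```
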
demos/comfyui_nodes.py ADDED
File without changes
demos/fine_tuner/README.md ADDED
@@ -0,0 +1,99 @@
1
+ # Mochi 1 LoRA Fine-tuner
2
+
3
+ ![Mochi being made](../../assets/mochi-factory.webp)
4
+
5
+
6
+ This folder contains tools for fine-tuning the Mochi 1 model. It supports [LoRA](https://arxiv.org/abs/2106.09685) fine-tuning on a single GPU.
7
+
8
+ ## Quick Start (Single GPU)
9
+ This section shows how to prepare your dataset and run LoRA fine-tuning on a single GPU.
10
+
11
+ First, set up the inference code and download the Mochi 1 weights following [README.md](../../README.md).
12
+ All commands below assume you are in the top-level directory of the Mochi repo.
13
+
14
+ ### 1. Collect your videos and captions
15
+ Collect your videos (supported formats: MP4, MOV) into a folder, e.g. `videos/`. Then, write a detailed description of each video in a `.txt` file with the same name. For example,
16
+ ```
17
+ videos/
18
+ video_1.mp4
19
+ video_1.txt -- One-paragraph description of video_1
20
+ video_2.mp4
21
+ video_2.txt -- One-paragraph description of video_2
22
+ ...
23
+ ```
24
+
25
+ ### 2. Process videos and captions (About 2 minutes)
26
+ Update the paths in the command below to match your dataset. Videos are processed at 30 FPS, so make sure your videos are at least `num_frames / 30` seconds long.
27
+ ```bash
28
+ bash demos/fine_tuner/preprocess.bash -v videos/ -o videos_prepared/ -w weights/ --num_frames 37
29
+ ```
30
+
31
+ ### 3. Fine-tune the model
32
+ Update `./demos/fine_tuner/configs/lora.yaml` to customize the fine-tuning process,
33
+ including prompts to generate at various points of the fine-tuning process and the path to your prepared videos.
34
+
35
+ Launch LoRA fine-tuning on single GPU:
36
+ ```bash
37
+ bash ./demos/fine_tuner/run.bash -c ./demos/fine_tuner/configs/lora.yaml -n 1
38
+ ```
39
+
40
+ Samples will be generated in `finetunes/my_mochi_lora/samples` every 200 steps.
41
+
42
+ ### 4. Use your fine-tuned weights to generate videos!
43
+ Update `--lora_path` to the path of your fine-tuned weights and run:
44
+ ```python
45
+ python3 ./demos/cli.py --model_dir weights/ --lora_path finetunes/my_mochi_lora/model_2000.lora.safetensors --num_frames 37 --cpu_offload --prompt "A delicate porcelain teacup sits on a marble countertop. The teacup suddenly shatters into hundreds of white ceramic shards that scatter through the air. The scene is bright and crisp with dramatic lighting."
46
+ ```
47
+
48
+ You can increase the number of frames to generate a longer video. Finally, share your creations with the community by uploading your LoRA and sample videos to Hugging Face.
49
+
50
+ ## System Requirements
51
+
52
+ **Single GPU:**
53
+ - 1x H100 or A100 (80 GB VRAM is recommended)
54
+ - Less VRAM is required when training on videos shorter than 1 second.
55
+
56
+ **Supported video lengths:** Up to 85 frames (~2.8 seconds at 30 FPS)
57
+ - Choose a frame count in increments of 6: 25, 31, 37, ... 79, 85.
58
+ - Training on 37 frames uses 50 GB of VRAM. On 1 H100, each training step takes about 1.67 s/it,
59
+ and you'll start seeing changes to your videos within 200-400 steps. Training for 1,000 steps takes about 30 minutes.
60
+
61
+ Settings tested on 1x H100 SXM:
62
+
63
+ | Frames | Video Length | VRAM | Time/step | num_qkv_checkpoint | num_ff_checkpoint | num_post_attn_checkpoint |
64
+ |--------|--------------|------|-----------|-------------------|-------------------|-------------------------|
65
+ | 37 frames | 1.2 second videos | 50 GB VRAM | 1.67 s/it | 48 | 48† | 48 |
66
+ | 61 frames | 2.0 second videos | 64 GB VRAM | 3.35 s/it | 48 | 48† | 48 |
67
+ | 79 frames | 2.6 second videos | 69-78 GB VRAM | 4.92 s/it | 48 | 48† | 48 |
68
+ | 85 frames | 2.8 second videos | 80 GB VRAM | 5.44 s/it | 48 | 48 | 48 |
69
+
70
+ *† As the VRAM is not fully used, you can lower `num_ff_checkpoint` to speed up training.*
71
+
72
+ ## Technical Details
73
+
74
+ - LoRA fine-tuning updates the query, key, and value projection matrices, as well as the output projection matrix.
75
+ These settings are configurable in `./demos/fine_tuner/configs/lora.yaml`.
76
+ - We welcome contributions and suggestions for improved settings.
77
+
78
+ ## Known Limitations
79
+
80
+ - No support for training on multiple GPUs
81
+ - LoRA inference is restricted to 1-GPU (for now)
82
+
83
+ ## Tips
84
+
85
+ - Be as descriptive as possible in your captions.
86
+ - A learning rate around 1e-4 or 2e-4 seems effective for LoRA fine-tuning.
87
+ - For larger datasets, or to customize the model more aggressively, increase `num_steps` in the YAML.
88
+ - To monitor training loss, uncomment the `wandb` section in the YAML and run `wandb login` or set the `WANDB_API_KEY` environment variable.
89
+ - Videos are trimmed to the **first** `num_frames` frames. Make sure your clips contain the content you care about near the beginning.
90
+ You can check the trimmed versions after running `preprocess.bash` to make sure they look good.
91
+ - When capturing HDR videos on an iPhone, convert your .mov files to .mp4 using the Handbrake application. Our preprocessing script won't produce the correct colorspace otherwise, and your fine-tuned videos may look overly bright.
92
+
93
+ ### If you are running out of GPU memory, make sure:
94
+ - `COMPILE_DIT=1` is set in `demos/fine_tuner/run.bash`.
95
+ This enables model compilation, which saves memory and speeds up training!
96
+ - `num_post_attn_checkpoint`, `num_ff_checkpoint`, and `num_qkv_checkpoint` are set to 48 in your YAML.
97
+ You can checkpoint up to 48 layers, saving memory at the cost of slower training.
98
+ - If all else fails, reduce `num_frames` when processing your videos and in your YAML.
99
+ You can fine-tune Mochi on shorter videos, and still generate longer videos at inference time.
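
Step 1 above requires every clip to have a caption file with the same basename. A small sketch (the `videos` path is illustrative) to verify that pairing before running `preprocess.bash`:

```python
# Sketch: confirm every .mp4/.mov clip in videos/ has a matching .txt caption.
from pathlib import Path

videos_dir = Path("videos")  # adjust to your dataset folder
clips = [p for p in videos_dir.iterdir() if p.suffix.lower() in {".mp4", ".mov"}]
missing = [p.name for p in clips if not p.with_suffix(".txt").is_file()]
print(f"{len(clips)} clips found, {len(missing)} missing captions")
for name in missing:
    print("  missing caption for", name)
```
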
demos/fine_tuner/configs/lora.yaml ADDED
@@ -0,0 +1,58 @@
1
+ init_checkpoint_path: weights/dit.safetensors
2
+ checkpoint_dir: finetunes/my_mochi_lora
3
+ train_data_dir: videos_prepared
4
+ attention_mode: sdpa
5
+ single_video_mode: false # Useful for debugging whether your model can learn a single video
6
+
7
+ # You only need this if you're using wandb
8
+ wandb:
9
+ # project: mochi_1_lora
10
+ # name: ${checkpoint_dir}
11
+ # group: null
12
+
13
+ optimizer:
14
+ lr: 2e-4
15
+ weight_decay: 0.01
16
+
17
+ model:
18
+ type: lora
19
+ kwargs:
20
+ # Apply LoRA to the QKV projection and the output projection of the attention block.
21
+ qkv_proj_lora_rank: 16
22
+ qkv_proj_lora_alpha: 16
23
+ qkv_proj_lora_dropout: 0.
24
+ out_proj_lora_rank: 16
25
+ out_proj_lora_alpha: 16
26
+ out_proj_lora_dropout: 0.
27
+
28
+ training:
29
+ model_dtype: bf16
30
+ warmup_steps: 200
31
+ num_qkv_checkpoint: 48
32
+ num_ff_checkpoint: 48
33
+ num_post_attn_checkpoint: 48
34
+ num_steps: 2000
35
+ save_interval: 200
36
+ caption_dropout: 0.1
37
+ grad_clip: 0.0
38
+ save_safetensors: true
39
+
40
+ # Used for generating samples during training to monitor progress ...
41
+ sample:
42
+ interval: 200
43
+ output_dir: ${checkpoint_dir}/samples
44
+ decoder_path: weights/decoder.safetensors
45
+ prompts:
46
+ - A pristine snowglobe featuring a winter scene sits peacefully. The globe violently explodes, sending glass, water, and glittering fake snow in all directions. The scene is captured with high-speed photography.
47
+ - A vintage pocket watch ticks quietly on an antique desk. Suddenly, it explodes into gears, springs and metal fragments that scatter through the air. The scene is richly detailed with warm, brass tones.
48
+ - A cello is propped up against a wall, a single spotlight illuminating it. The cello explodes into wooden fragments, sending debris everywhere. The scene is vibrant and colorful.
49
+ - A graphics card sits inside an oven, heatwaves around it. Suddenly, the graphics card explodes into numerous fragments, sending debris everywhere. The scene is darkly lit, high contrast, with a focus on the shattered pieces.
50
+ - A delicate porcelain teacup sits on a marble countertop. The teacup suddenly shatters into hundreds of white ceramic shards that scatter through the air. The scene is bright and crisp with dramatic lighting.
51
+ seed: 12345
52
+ kwargs:
53
+ height: 480
54
+ width: 848
55
+ num_frames: 37
56
+ num_inference_steps: 64
57
+ sigma_schedule_python_code: "linear_quadratic_schedule(64, 0.025)"
58
+ cfg_schedule_python_code: "[6.0] * 64"
demos/fine_tuner/dataset.py ADDED
@@ -0,0 +1,45 @@
1
+ from pathlib import Path
2
+
3
+ import click
4
+ import torch
5
+ from torch.utils.data import DataLoader, Dataset
6
+
7
+
8
+ def load_to_cpu(x):
9
+ return torch.load(x, map_location=torch.device("cpu"), weights_only=True)
10
+
11
+
12
+ class LatentEmbedDataset(Dataset):
13
+ def __init__(self, file_paths, repeat=1):
14
+ self.items = [
15
+ (Path(p).with_suffix(".latent.pt"), Path(p).with_suffix(".embed.pt"))
16
+ for p in file_paths
17
+ if Path(p).with_suffix(".latent.pt").is_file() and Path(p).with_suffix(".embed.pt").is_file()
18
+ ]
19
+ self.items = self.items * repeat
20
+ print(f"Loaded {len(self.items)}/{len(file_paths)} valid file pairs.")
21
+
22
+ def __len__(self):
23
+ return len(self.items)
24
+
25
+ def __getitem__(self, idx):
26
+ latent_path, embed_path = self.items[idx]
27
+ return load_to_cpu(latent_path), load_to_cpu(embed_path)
28
+
29
+
30
+ @click.command()
31
+ @click.argument("directory", type=click.Path(exists=True, file_okay=False))
32
+ def process_videos(directory):
33
+ dir_path = Path(directory)
34
+ mp4_files = [str(f) for f in dir_path.glob("**/*.mp4") if not f.name.endswith(".recon.mp4")]
35
+ assert mp4_files, f"No mp4 files found"
36
+
37
+ dataset = LatentEmbedDataset(mp4_files)
38
+ dataloader = DataLoader(dataset, batch_size=4, shuffle=True)
39
+
40
+ for latents, embeds in dataloader:
41
+ print([(k, v.shape) for k, v in latents.items()])
42
+
43
+
44
+ if __name__ == "__main__":
45
+ process_videos()
demos/fine_tuner/embed_captions.py ADDED
@@ -0,0 +1,66 @@
1
+ #! /usr/bin/env python3
2
+ from pathlib import Path
3
+
4
+ import click
5
+ import torch
6
+ from tqdm import tqdm
7
+ from transformers import T5Tokenizer
8
+
9
+ from genmo.mochi_preview.pipelines import T5_MODEL, T5ModelFactory, get_conditioning_for_prompts
10
+
11
+
12
+ @click.command()
13
+ @click.argument("captions_dir", type=click.Path(exists=True, file_okay=False, dir_okay=True, path_type=Path))
14
+ @click.option("--device_id", default=0, help="GPU device ID to use")
15
+ @click.option("--overwrite", "-ow", is_flag=True, help="Overwrite existing embeddings")
16
+ def process_captions(captions_dir: Path, device_id: int, overwrite=True) -> None:
17
+ """Process all text files in a directory using T5 encoder.
18
+
19
+ Args:
20
+ captions_dir: Directory containing input text files
21
+ device_id: GPU device ID to use
22
+ """
23
+
24
+ torch.backends.cuda.matmul.allow_tf32 = True
25
+ torch.backends.cudnn.allow_tf32 = True
26
+
27
+ # Get all text file paths
28
+ text_paths = list(captions_dir.glob("**/*.txt"))
29
+ if not text_paths:
30
+ print(f"No text files found in {captions_dir}")
31
+ return
32
+
33
+ # Initialize model and tokenizer
34
+ model_factory = T5ModelFactory()
35
+ device = f"cuda:{device_id}"
36
+ model = model_factory.get_model(local_rank=0, device_id=device_id, world_size=1)
37
+ tokenizer = T5Tokenizer.from_pretrained(T5_MODEL, legacy=False)
38
+
39
+ with tqdm(total=len(text_paths)) as pbar:
40
+ for text_path in text_paths:
41
+ embed_path = text_path.with_suffix(".embed.pt")
42
+ if embed_path.exists() and not overwrite:
43
+ pbar.write(f"Skipping {text_path} - embeddings already exist")
44
+ continue
45
+
46
+ pbar.write(f"Processing {text_path}")
47
+ try:
48
+ with open(text_path) as f:
49
+ text = f.read().strip()
50
+
51
+ with torch.inference_mode():
52
+ conditioning = get_conditioning_for_prompts(tokenizer, model, device, [text])
53
+
54
+ torch.save(conditioning, embed_path)
55
+
56
+ except Exception as e:
57
+ import traceback
58
+
59
+ traceback.print_exc()
60
+ pbar.write(f"Error processing {text_path}: {str(e)}")
61
+
62
+ pbar.update(1)
63
+
64
+
65
+ if __name__ == "__main__":
66
+ process_captions()
demos/fine_tuner/encode_videos.py ADDED
@@ -0,0 +1,142 @@
1
+ #! /usr/bin/env python3
2
+ import os
3
+ from pathlib import Path
4
+ import traceback
5
+ from typing import Optional
6
+
7
+ import click
8
+ import ray
9
+ import torch
10
+ import torchvision
11
+ from einops import rearrange
12
+
13
+ import genmo.mochi_preview.dit.joint_model.context_parallel as cp
14
+ import genmo.mochi_preview.vae.cp_conv as cp_conv
15
+ from genmo.lib.progress import get_new_progress_bar, progress_bar
16
+ from genmo.lib.utils import Timer, save_video
17
+ from genmo.mochi_preview.pipelines import DecoderModelFactory, EncoderModelFactory
18
+ from genmo.mochi_preview.vae.models import add_fourier_features, decode_latents
19
+
20
+
21
+ class GPUContext:
22
+ def __init__(
23
+ self,
24
+ *,
25
+ encoder_factory: Optional[EncoderModelFactory] = None,
26
+ decoder_factory: Optional[DecoderModelFactory] = None,
27
+ ):
28
+ t = Timer()
29
+ self.device = torch.device("cuda")
30
+ if encoder_factory is not None:
31
+ with t("load_encoder"):
32
+ self.encoder = encoder_factory.get_model()
33
+ if decoder_factory is not None:
34
+ with t("load_decoder"):
35
+ self.decoder = decoder_factory.get_model()
36
+ t.print_stats()
37
+
38
+
39
+ def preprocess(ctx: GPUContext, vid_path: Path, shape: str, reconstruct: bool):
40
+ T, H, W = [int(s) for s in shape.split("x")]
41
+ assert (T - 1) % 6 == 0, "Expected T to be 1 mod 6"
42
+ video, _, metadata = torchvision.io.read_video(
43
+ str(vid_path), output_format="THWC", pts_unit="secs")
44
+ fps = metadata["video_fps"]
45
+ video = rearrange(video, "t h w c -> c t h w")
46
+ og_shape = video.shape
47
+ assert video.shape[2] == H, f"Expected {vid_path} to have height {H}, got {video.shape}"
48
+ assert video.shape[3] == W, f"Expected {vid_path} to have width {W}, got {video.shape}"
49
+ assert video.shape[1] >= T, f"Expected {vid_path} to have at least {T} frames, got {video.shape}"
50
+ if video.shape[1] > T:
51
+ video = video[:, :T]
52
+ print(f"Trimmed video from {og_shape[1]} to first {T} frames")
53
+ video = video.unsqueeze(0)
54
+ video = video.float() / 127.5 - 1.0
55
+ video = video.to(ctx.device)
56
+ video = add_fourier_features(video)
57
+
58
+ assert video.ndim == 5
59
+ video = cp.local_shard(video, dim=2) # split along time dimension
60
+
61
+ with torch.inference_mode():
62
+ with torch.autocast("cuda", dtype=torch.bfloat16):
63
+ ldist = ctx.encoder(video)
64
+
65
+ print(f"{og_shape} -> {ldist.mean.shape}")
66
+ torch.save(
67
+ dict(mean=ldist.mean, logvar=ldist.logvar),
68
+ vid_path.with_suffix(".latent.pt"),
69
+ )
70
+
71
+ if reconstruct:
72
+ latents = ldist.sample()
73
+ frames = decode_latents(ctx.decoder, latents)
74
+ frames = frames.cpu().numpy()
75
+ save_video(frames[0], str(vid_path.with_suffix(".recon.mp4")), fps=fps)
76
+
77
+
78
+ @click.command()
79
+ @click.argument("videos_dir", type=click.Path(exists=True, file_okay=False, dir_okay=True, path_type=Path))
80
+ @click.option(
81
+ "--model_dir",
82
+ type=click.Path(exists=True, file_okay=False, dir_okay=True, path_type=Path),
83
+ help="Path to folder containing Mochi's VAE encoder and decoder weights. Download from Hugging Face: https://huggingface.co/genmo/mochi-1-preview/blob/main/encoder.safetensors and https://huggingface.co/genmo/mochi-1-preview/blob/main/decoder.safetensors",
84
+ default="weights/",
85
+ )
86
+ @click.option("--num_gpus", default=1, help="Number of GPUs to split the encoder over")
87
+ @click.option(
88
+ "--recon_interval", default=10, help="Reconstruct one out of every N videos (0 to disable reconstruction)"
89
+ )
90
+ @click.option("--shape", default="163x480x848", help="Shape of the video to encode")
91
+ @click.option("--overwrite", "-ow", is_flag=True, help="Overwrite existing latents")
92
+ def batch_process(
93
+ videos_dir: Path, model_dir: Path, num_gpus: int, recon_interval: int, shape: str, overwrite: bool
94
+ ) -> None:
95
+ """Process all videos in a directory using multiple GPUs.
96
+
97
+ Args:
98
+ videos_dir: Directory containing input videos
99
+ encoder_path: Path to encoder model weights
100
+ decoder_path: Path to decoder model weights
101
+ num_gpus: Number of GPUs to use for parallel processing
102
+ recon_interval: Frequency of video reconstructions (0 to disable)
103
+ """
104
+
105
+ torch.backends.cuda.matmul.allow_tf32 = True
106
+ torch.backends.cudnn.allow_tf32 = True
107
+
108
+ # Get all video paths
109
+ video_paths = list(videos_dir.glob("**/*.mp4"))
110
+ if not video_paths:
111
+ print(f"No MP4 files found in {videos_dir}")
112
+ return
113
+
114
+ preproc = GPUContext(
115
+ encoder_factory=EncoderModelFactory(model_path=os.path.join(model_dir, "encoder.safetensors")),
116
+ decoder_factory=DecoderModelFactory(model_path=os.path.join(model_dir, "decoder.safetensors")),
117
+ )
118
+ with progress_bar(type="ray_tqdm"):
119
+ for idx, video_path in get_new_progress_bar((list(enumerate(sorted(video_paths))))):
120
+ if str(video_path).endswith(".recon.mp4"):
121
+ print(f"Skipping {video_path} b/c it is a reconstruction")
122
+ continue
123
+
124
+ print(f"Processing {video_path}")
125
+ try:
126
+ if video_path.with_suffix(".latent.pt").exists() and not overwrite:
127
+ print(f"Skipping {video_path}")
128
+ continue
129
+
130
+ preprocess(
131
+ ctx=preproc,
132
+ vid_path=video_path,
133
+ shape=shape,
134
+ reconstruct=recon_interval != 0 and idx % recon_interval == 0,
135
+ )
136
+ except Exception as e:
137
+ traceback.print_exc()
138
+ print(f"Error processing {video_path}: {str(e)}")
139
+
140
+
141
+ if __name__ == "__main__":
142
+ batch_process()
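Each .latent.pt stores the posterior mean and log-variance rather than a fixed sample. A sketch of drawing a latent with the standard reparameterization trick (train.py does the equivalent via LatentDistribution.sample(); the path is hypothetical):

    import torch

    latent = torch.load("videos_prepared/clip_0001.latent.pt", map_location="cpu")
    mean, logvar = latent["mean"], latent["logvar"]
    z = mean + torch.exp(0.5 * logvar) * torch.randn_like(mean)  # z ~ N(mean, exp(logvar))
    print(z.shape)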
demos/fine_tuner/preprocess.bash ADDED
@@ -0,0 +1,64 @@
1
+ #! /bin/bash
2
+
3
+ # Exit on error and print each command as it runs
4
+ set -eo pipefail
5
+ set -x
6
+
7
+ # Function to display help
8
+ usage() {
9
+ echo "Usage: $0 -v|--videos_dir videos_dir -o|--output_dir output_dir -w|--weights_dir weights_dir -n|--num_frames num_frames"
10
+ echo " -v, --videos_dir Path to the videos directory"
11
+ echo " -o, --output_dir Path to the output directory"
12
+ echo " -w, --weights_dir Path to the weights directory"
13
+ echo " -n, --num_frames Number of frames"
14
+ exit 1
15
+ }
16
+
17
+ # Function to check if the next argument is missing
18
+ check_argument() {
19
+ if [[ -z "$2" || "$2" == -* ]]; then
20
+ echo "Error: Argument for $1 is missing"
21
+ usage
22
+ fi
23
+ }
24
+
25
+ # Parse command-line arguments
26
+ while [[ "$#" -gt 0 ]]; do
27
+ case $1 in
28
+ -v|--videos_dir) check_argument "$1" "$2"; VIDEOS_DIR="$2"; shift ;;
29
+ -o|--output_dir) check_argument "$1" "$2"; OUTPUT_DIR="$2"; shift ;;
30
+ -w|--weights_dir) check_argument "$1" "$2"; WEIGHTS_DIR="$2"; shift ;;
31
+ -n|--num_frames) check_argument "$1" "$2"; NUM_FRAMES="$2"; shift ;;
32
+ -h|--help) usage ;;
33
+ *) echo "Unknown parameter passed: $1"; usage ;;
34
+ esac
35
+ shift
36
+ done
37
+
38
+ # Check if all required arguments are provided
39
+ if [[ -z "$VIDEOS_DIR" || -z "$OUTPUT_DIR" || -z "$WEIGHTS_DIR" || -z "$NUM_FRAMES" ]]; then
40
+ echo "Error: All arguments are required."
41
+ usage
42
+ fi
43
+
44
+ # Get the directory where this script is located
45
+ SCRIPT_DIR="$( cd "$( dirname "${BASH_SOURCE[0]}" )" && pwd )"
46
+ echo "Using script directory: ${SCRIPT_DIR}"
47
+
48
+ ##### Step 1: Trim and resize videos
49
+ echo -e "\n\e[1;35m🎬 **Step 1: Trim and resize videos** \e[0m"
50
+ # Calculate duration to trim videos
51
+ DURATION=$(printf "%.1f" "$(echo "($NUM_FRAMES / 30) + 0.09" | bc -l)")
52
+ echo "Trimming videos to duration: ${DURATION} seconds"
53
+ python3 ${SCRIPT_DIR}/trim_and_crop_videos.py ${VIDEOS_DIR} ${OUTPUT_DIR} -d ${DURATION}
54
+
55
+ ##### Step 2: Run the VAE encoder on each video.
56
+ echo -e "\n\e[1;35m🎥 **Step 2: Run the VAE encoder on each video** \e[0m"
57
+ python3 ${SCRIPT_DIR}/encode_videos.py ${OUTPUT_DIR} \
58
+ --model_dir ${WEIGHTS_DIR} --num_gpus 1 --shape "${NUM_FRAMES}x480x848" --overwrite
59
+
60
+ ##### Step 3: Compute T5 embeddings
61
+ echo -e "\n\e[1;35m🧠 **Step 3: Compute T5 embeddings** \e[0m"
62
+ python3 ${SCRIPT_DIR}/embed_captions.py --overwrite ${OUTPUT_DIR}
63
+
64
+ echo -e "\n\e[1;32m✓ Done!\e[0m"
demos/fine_tuner/run.bash ADDED
@@ -0,0 +1,92 @@
1
+ #! /bin/bash
2
+
3
+ # Enable job control and set process group
4
+ set -m
5
+ trap 'kill $(jobs -p)' EXIT INT TERM
6
+
7
+ # Get the directory where this script is located
8
+ SCRIPT_DIR="$( cd "$( dirname "${BASH_SOURCE[0]}" )" && pwd )"
9
+ DEFAULT_CONFIG="${SCRIPT_DIR}/configs/finetune.yaml"
10
+
11
+ # Parse command line arguments
12
+ usage() {
13
+ echo "Usage: $0 [-c|--config <config_path>] [-n|--num-gpus <num_gpus>]"
14
+ echo " -c, --config Path to config file (default: ${DEFAULT_CONFIG})"
15
+ echo " -n, --num-gpus Number of GPUs to use (default: 8)"
16
+ exit 1
17
+ }
18
+
19
+ # Default values
20
+ CONFIG_PATH="${DEFAULT_CONFIG}"
21
+ NUM_GPUS=8
22
+
23
+ # Parse arguments
24
+ while [[ $# -gt 0 ]]; do
25
+ case $1 in
26
+ -c|--config)
27
+ CONFIG_PATH="$2"
28
+ shift 2
29
+ ;;
30
+ -n|--num-gpus)
31
+ NUM_GPUS="$2"
32
+ shift 2
33
+ ;;
34
+ -h|--help)
35
+ usage
36
+ ;;
37
+ *)
38
+ echo "Unknown option: $1"
39
+ usage
40
+ ;;
41
+ esac
42
+ done
43
+
44
+ # Validate config file exists
45
+ if [ ! -f "${CONFIG_PATH}" ]; then
46
+ echo "Config file not found at ${CONFIG_PATH}"
47
+ exit 1
48
+ fi
49
+
50
+ # Validate num_gpus is a positive integer
51
+ if ! [[ "$NUM_GPUS" =~ ^[1-9][0-9]*$ ]]; then
52
+ echo "Number of GPUs must be a positive integer"
53
+ exit 1
54
+ fi
55
+
56
+ # Set distributed training environment variables
57
+ export MASTER_PORT=29500
58
+ export MASTER_ADDR="localhost"
59
+ export WORLD_SIZE=$NUM_GPUS
60
+ export TF_CPP_MIN_LOG_LEVEL=3
61
+ export COMPILE_DIT=1
62
+
63
+ # Set IS_DISTRIBUTED based on NUM_GPUS
64
+ if [ "$NUM_GPUS" -gt 1 ]; then
65
+ export IS_DISTRIBUTED=true
66
+ fi
67
+
68
+ # Load .env file (if it exists)
69
+ if [ -f ".env" ]; then
70
+ export $(grep -v '^#' .env | xargs)
71
+ fi
72
+
73
+ echo "Starting training with ${NUM_GPUS} GPU(s), mode: ${IS_DISTRIBUTED:+distributed}${IS_DISTRIBUTED:-single_gpu}"
74
+ echo "Using config: ${CONFIG_PATH}"
75
+
76
+ # Launch processes
77
+ if [ "$NUM_GPUS" -gt 1 ]; then
78
+ for RANK in $(seq 0 $((NUM_GPUS-1))); do
79
+ env RANK=$RANK CUDA_VISIBLE_DEVICES=$RANK python "${SCRIPT_DIR}/train.py" --config-path "${CONFIG_PATH}" &
80
+ done
81
+ else
82
+ python "${SCRIPT_DIR}/train.py" --config-path "${CONFIG_PATH}" &
83
+ fi
84
+
85
+ # Wait for all background processes to complete
86
+ wait
87
+
88
+ # Check if any process failed
89
+ if [ $? -ne 0 ]; then
90
+ echo "One or more training processes failed"
91
+ exit 1
92
+ fi
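A typical single-GPU LoRA run, assuming the preprocessed data and weight paths referenced in demos/fine_tuner/configs/lora.yaml are in place:

    bash demos/fine_tuner/run.bash -c demos/fine_tuner/configs/lora.yaml -n 1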
demos/fine_tuner/train.py ADDED
@@ -0,0 +1,396 @@
1
+ import json
2
+ import multiprocessing as mp
3
+ import os
4
+ import random
5
+ import re
6
+ import sys
7
+ import time
8
+ from contextlib import contextmanager
9
+ from glob import glob
10
+ from pathlib import Path
11
+ from typing import Any, Dict, Tuple, cast
12
+
13
+ import click
14
+ import numpy as np
15
+ from omegaconf import DictConfig, ListConfig, OmegaConf
16
+ from safetensors.torch import save_file
17
+ import torch
18
+ from torch import Tensor
19
+ from torch.distributed.checkpoint.state_dict import StateDictOptions, get_state_dict
20
+ import torch.nn.functional as F
21
+ from tqdm import tqdm
22
+
23
+ torch._dynamo.config.cache_size_limit = 32
24
+ torch.backends.cuda.matmul.allow_tf32 = True
25
+ torch.backends.cudnn.allow_tf32 = True
26
+ torch.use_deterministic_algorithms(False)
27
+
28
+ import genmo.mochi_preview.dit.joint_model.lora as lora
29
+ from genmo.lib.progress import progress_bar
30
+ from genmo.lib.utils import Timer, save_video
31
+ from genmo.mochi_preview.pipelines import (
32
+ DecoderModelFactory,
33
+ DitModelFactory,
34
+ ModelFactory,
35
+ T5ModelFactory,
36
+ cast_dit,
37
+ compute_packed_indices,
38
+ get_conditioning,
39
+ linear_quadratic_schedule, # used in eval'd Python code in lora.yaml
40
+ load_to_cpu,
41
+ move_to_device,
42
+ sample_model,
43
+ t5_tokenizer,
44
+ )
45
+ from genmo.mochi_preview.vae.latent_dist import LatentDistribution
46
+ from genmo.mochi_preview.vae.models import decode_latents_tiled_spatial
47
+
48
+ sys.path.append("..")
49
+
50
+ from dataset import LatentEmbedDataset
51
+
52
+
53
+ class MochiTorchRunEvalPipeline:
54
+ def __init__(
55
+ self,
56
+ *,
57
+ device_id,
58
+ dit,
59
+ text_encoder_factory: ModelFactory,
60
+ decoder_factory: ModelFactory,
61
+ ):
62
+ self.device = torch.device(f"cuda:{device_id}")
63
+ self.tokenizer = t5_tokenizer()
64
+ t = Timer()
65
+ self.dit = dit
66
+ with t("load_text_encoder"):
67
+ self.text_encoder = text_encoder_factory.get_model(
68
+ local_rank=0,
69
+ world_size=1,
70
+ device_id="cpu",
71
+ )
72
+ with t("load_vae"):
73
+ self.decoder = decoder_factory.get_model(local_rank=0, device_id="cpu", world_size=1)
74
+ t.print_stats() # type: ignore
75
+
76
+ def __call__(self, prompt, save_path, **kwargs):
77
+ with progress_bar(type="tqdm", enabled=True), torch.inference_mode():
78
+ # Encode prompt with T5 XXL.
79
+ with move_to_device(self.text_encoder, self.device, enabled=True):
80
+ conditioning = get_conditioning(
81
+ self.tokenizer,
82
+ self.text_encoder,
83
+ self.device,
84
+ batch_inputs=False,
85
+ prompt=prompt,
86
+ negative_prompt="",
87
+ )
88
+
89
+ # Sample video latents from Mochi.
90
+ with move_to_device(self.dit, self.device, enabled=True):
91
+ latents = sample_model(self.device, self.dit, conditioning, **kwargs)
92
+
93
+ # Decode video latents to frames.
94
+ with move_to_device(self.decoder, self.device, enabled=True):
95
+ frames = decode_latents_tiled_spatial(
96
+ self.decoder, latents, num_tiles_w=2, num_tiles_h=2, overlap=8)
97
+ frames = frames.cpu().numpy() # b t h w c
98
+ assert isinstance(frames, np.ndarray)
99
+
100
+ save_video(frames[0], save_path)
101
+
102
+
103
+ def map_to_device(x, device: torch.device):
104
+ if isinstance(x, dict):
105
+ return {k: map_to_device(v, device) for k, v in x.items()}
106
+ elif isinstance(x, list):
107
+ return [map_to_device(y, device) for y in x]
108
+ elif isinstance(x, tuple):
109
+ return tuple(map_to_device(y, device) for y in x)
110
+ elif isinstance(x, torch.Tensor):
111
+ return x.to(device, non_blocking=True)
112
+ else:
113
+ return x
114
+
115
+
116
+ EPOCH_IDX = 0
117
+
118
+
119
+ def infinite_dl(dl):
120
+ global EPOCH_IDX
121
+ while True:
122
+ EPOCH_IDX += 1
123
+ for batch in dl:
124
+ yield batch
125
+
126
+
127
+ @contextmanager
128
+ def timer(description="Task", enabled=True):
129
+ if enabled:
130
+ start = time.perf_counter()
131
+ try:
132
+ yield
133
+ finally:
134
+ if enabled:
135
+ elapsed = time.perf_counter() - start # type: ignore
136
+ print(f"{description} took {elapsed:.4f} seconds")
137
+
138
+
139
+ def get_cosine_annealing_lr_scheduler(
140
+ optimizer: torch.optim.Optimizer,
141
+ warmup_steps: int,
142
+ total_steps: int,
143
+ ):
144
+ def lr_lambda(step):
145
+ if step < warmup_steps:
146
+ return float(step) / float(max(1, warmup_steps))
147
+ else:
148
+ return 0.5 * (1 + np.cos(np.pi * (step - warmup_steps) / (total_steps - warmup_steps)))
149
+
150
+ return torch.optim.lr_scheduler.LambdaLR(optimizer, lr_lambda)
151
+
152
+
153
+ @click.command()
154
+ @click.option("--config-path", type=click.Path(exists=True), required=True, help="Path to YAML config file")
155
+ def main(config_path):
156
+ mp.set_start_method("spawn", force=True)
157
+ cfg = cast(DictConfig, OmegaConf.load(config_path))
158
+
159
+ device_id = 0
160
+ device_str = f"cuda:{device_id}"
161
+ device = torch.device(device_str)
162
+
163
+ # Verify checkpoint path exists
164
+ checkpoint_path = Path(cfg.init_checkpoint_path)
165
+ assert checkpoint_path.exists(), f"Checkpoint file not found: {checkpoint_path}"
166
+
167
+ # Create checkpoint directory if it doesn't exist
168
+ checkpoint_dir = Path(cfg.checkpoint_dir)
169
+ checkpoint_dir.mkdir(parents=True, exist_ok=True)
170
+
171
+ # Get step number from checkpoint filename
172
+ pattern = r"model_(\d+)\.(lora|checkpoint)\.(safetensors|pt)"
173
+ match = re.search(pattern, str(checkpoint_path))
174
+ if match:
175
+ start_step_num = int(match.group(1))
176
+ opt_path = str(checkpoint_path).replace("model_", "optimizer_")
177
+ else:
178
+ start_step_num = 0
179
+ opt_path = ""
180
+
181
+ print(
182
+ f"model={checkpoint_path}, optimizer={opt_path}, start_step_num={start_step_num}"
183
+ )
184
+
185
+ wandb_run = None
186
+ sample_prompts = cfg.sample.prompts
187
+
188
+ train_vids = list(sorted(glob(f"{cfg.train_data_dir}/*.mp4")))
189
+ train_vids = [v for v in train_vids if not v.endswith(".recon.mp4")]
190
+ print(f"Found {len(train_vids)} training videos in {cfg.train_data_dir}")
191
+ assert len(train_vids) > 0, f"No training data found in {cfg.train_data_dir}"
192
+ if cfg.single_video_mode:
193
+ train_vids = train_vids[:1]
194
+ sample_prompts = [Path(train_vids[0]).with_suffix(".txt").read_text()]
195
+ print(f"Training on video: {train_vids[0]}")
196
+
197
+ train_dataset = LatentEmbedDataset(
198
+ train_vids,
199
+ repeat=1_000 if cfg.single_video_mode else 1,
200
+ )
201
+ train_dl = torch.utils.data.DataLoader(
202
+ train_dataset,
203
+ batch_size=None,
204
+ num_workers=4,
205
+ shuffle=True,
206
+ pin_memory=True,
207
+ )
208
+ train_dl_iter = infinite_dl(train_dl)
209
+
210
+ if cfg.get("wandb"):
211
+ import wandb
212
+
213
+ wandb_run = wandb.init(
214
+ project=cfg.wandb.project,
215
+ name=f"{cfg.wandb.name}-{int(time.time())}",
216
+ config=OmegaConf.to_container(cfg), # type: ignore
217
+ )
218
+ print(f"🚀 Weights & Biases run URL: {wandb_run.get_url()}")
219
+
220
+ print("Loading model")
221
+ patch_model_fns = []
222
+ model_kwargs = {}
223
+ is_lora = cfg.model.type == "lora"
224
+ print(f"Training type: {'LoRA' if is_lora else 'Full'}")
225
+ if is_lora:
226
+ def mark_lora_params(m):
227
+ lora.mark_only_lora_as_trainable(m, bias="none")
228
+ return m
229
+
230
+ patch_model_fns.append(mark_lora_params)
231
+ model_kwargs = dict(**cfg.model.kwargs)
232
+ # Replace ListConfig with list to allow serialization to JSON.
233
+ for k, v in model_kwargs.items():
234
+ if isinstance(v, ListConfig):
235
+ model_kwargs[k] = list(v)
236
+
237
+ if cfg.training.get("model_dtype"):
238
+ assert cfg.training.model_dtype == "bf16", f"Only bf16 is supported"
239
+ patch_model_fns.append(lambda m: cast_dit(m, torch.bfloat16))
240
+
241
+ model = (
242
+ DitModelFactory(
243
+ model_path=str(checkpoint_path),
244
+ model_dtype="bf16",
245
+ attention_mode=cfg.attention_mode
246
+ ).get_model(
247
+ local_rank=0,
248
+ device_id=device_id,
249
+ model_kwargs=model_kwargs,
250
+ patch_model_fns=patch_model_fns,
251
+ world_size=1,
252
+ strict_load=not is_lora,
253
+ fast_init=not is_lora, # fast_init not supported for LoRA (please someone fix this !!!)
254
+ )
255
+ .train() # calling train() makes sure LoRA weights are not merged
256
+ )
257
+
258
+ optimizer = torch.optim.AdamW(model.parameters(), **cfg.optimizer)
259
+ if os.path.exists(opt_path):
260
+ print("Loading optimizer")
261
+ optimizer.load_state_dict(load_to_cpu(opt_path))
262
+
263
+ scheduler = get_cosine_annealing_lr_scheduler(
264
+ optimizer,
265
+ warmup_steps=cfg.training.warmup_steps,
266
+ total_steps=cfg.training.num_steps
267
+ )
268
+
269
+ print("Loading eval pipeline ...")
270
+ eval_pipeline = MochiTorchRunEvalPipeline(
271
+ device_id=device_id,
272
+ dit=model,
273
+ text_encoder_factory=T5ModelFactory(),
274
+ decoder_factory=DecoderModelFactory(model_path=cfg.sample.decoder_path),
275
+ )
276
+
277
+ def get_batch() -> Tuple[Dict[str, Any], Tensor, Tensor, Tensor]:
278
+ nonlocal train_dl_iter
279
+ batch = next(train_dl_iter) # type: ignore
280
+ latent, embed = cast(Tuple[Dict[str, Any], Dict[str, Any]], batch)
281
+ assert len(embed["y_feat"]) == 1 and len(embed["y_mask"]) == 1, f"Only batch size 1 is supported"
282
+
283
+ ldist = LatentDistribution(latent["mean"], latent["logvar"])
284
+ z = ldist.sample()
285
+ assert torch.isfinite(z).all()
286
+ assert z.shape[0] == 1, f"Only batch size 1 is supported"
287
+
288
+ eps = torch.randn_like(z)
289
+ sigma = torch.rand(z.shape[:1], device="cpu", dtype=torch.float32)
290
+
291
+ if random.random() < cfg.training.caption_dropout:
292
+ embed["y_mask"][0].zero_()
293
+ embed["y_feat"][0].zero_()
294
+ return embed, z, eps, sigma
295
+
296
+ pbar = tqdm(
297
+ range(start_step_num, cfg.training.num_steps),
298
+ total=cfg.training.num_steps,
299
+ initial=start_step_num,
300
+ )
301
+ for step in pbar:
302
+ if cfg.sample.interval and step % cfg.sample.interval == 0 and step > 0:
303
+ sample_dir = Path(cfg.sample.output_dir)
304
+ sample_dir.mkdir(exist_ok=True)
305
+ model.eval()
306
+ for eval_idx, prompt in enumerate(sample_prompts):
307
+ save_path = sample_dir / f"{eval_idx}_{step}.mp4"
308
+ if save_path.exists():
309
+ print(f"Skipping {save_path} as it already exists")
310
+ continue
311
+
312
+ sample_kwargs = {
313
+ k.removesuffix("_python_code"): (eval(v) if k.endswith("_python_code") else v)
314
+ for k, v in cfg.sample.kwargs.items()
315
+ }
316
+ eval_pipeline(
317
+ prompt=prompt,
318
+ save_path=str(save_path),
319
+ seed=cfg.sample.seed + eval_idx,
320
+ **sample_kwargs,
321
+ )
322
+ Path(sample_dir / f"{eval_idx}_{step}.txt").write_text(prompt)
323
+ model.train()
324
+
325
+ if cfg.training.save_interval and step > 0 and step % cfg.training.save_interval == 0:
326
+ with timer("get_state_dict"):
327
+ if is_lora:
328
+ model_sd = lora.lora_state_dict(model, bias="none")
329
+ else:
330
+ # NOTE: Not saving optimizer state dict to save space.
331
+ model_sd, _optimizer_sd = get_state_dict(
332
+ model, [], options=StateDictOptions(cpu_offload=True, full_state_dict=True)
333
+ )
334
+
335
+ checkpoint_filename = f"model_{step}.{'lora' if is_lora else 'checkpoint'}.pt"
336
+ save_path = checkpoint_dir / checkpoint_filename
337
+ if cfg.training.get("save_safetensors", True):
338
+ save_path = save_path.with_suffix(".safetensors")
339
+ save_file(
340
+ model_sd, save_path,
341
+ # `safetensors` only supports string-to-string metadata,
342
+ # so we serialize the kwargs to a JSON string.
343
+ metadata=dict(kwargs=json.dumps(model_kwargs)),
344
+ )
345
+ else:
346
+ torch.save(model_sd, save_path)
347
+
348
+ with torch.no_grad(), timer("load_batch", enabled=False):
349
+ batch = get_batch()
350
+ embed, z, eps, sigma = map_to_device(batch, device)
351
+ embed = cast(Dict[str, Any], embed)
352
+
353
+ num_latent_toks = np.prod(z.shape[-3:])
354
+ indices = compute_packed_indices(device, cast(Tensor, embed["y_mask"][0]), int(num_latent_toks))
355
+
356
+ sigma_bcthw = sigma[:, None, None, None, None] # [B, 1, 1, 1, 1]
357
+ z_sigma = (1 - sigma_bcthw) * z + sigma_bcthw * eps
358
+ ut = z - eps
359
+
360
+ with torch.autocast("cuda", dtype=torch.bfloat16):
361
+ preds = model(
362
+ x=z_sigma,
363
+ sigma=sigma,
364
+ packed_indices=indices,
365
+ **embed,
366
+ num_ff_checkpoint=cfg.training.num_ff_checkpoint,
367
+ num_qkv_checkpoint=cfg.training.num_qkv_checkpoint,
368
+ )
369
+ assert preds.shape == z.shape
370
+
371
+ loss = F.mse_loss(preds.float(), ut.float())
372
+ loss.backward()
373
+
374
+ log_kwargs = {
375
+ "train/loss": loss.item(),
376
+ "train/epoch": EPOCH_IDX,
377
+ "train/lr": scheduler.get_last_lr()[0],
378
+ }
379
+
380
+ if cfg.training.get("grad_clip"):
381
+ assert not is_lora, "Gradient clipping not supported for LoRA"
382
+ gnorm_before_clip = torch.nn.utils.clip_grad_norm_(
383
+ model.parameters(), max_norm=cfg.training.grad_clip)
384
+ log_kwargs["train/gnorm"] = gnorm_before_clip.item()
385
+ pbar.set_postfix(**log_kwargs)
386
+
387
+ if wandb_run:
388
+ wandb_run.log(log_kwargs, step=step)
389
+
390
+ optimizer.step()
391
+ scheduler.step()
392
+ optimizer.zero_grad()
393
+
394
+
395
+ if __name__ == "__main__":
396
+ main()
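The objective in the loop above is a flow-matching (rectified-flow) regression: the clean latent z is mixed with noise as z_sigma = (1 - sigma) * z + sigma * eps and the DiT is trained to predict the velocity u = z - eps. A self-contained sketch with a stand-in model (packed indices, checkpointing, and the real DiT are omitted):

    import torch
    import torch.nn.functional as F

    def flow_matching_loss(model, z: torch.Tensor) -> torch.Tensor:
        eps = torch.randn_like(z)                        # Gaussian noise
        sigma = torch.rand(z.shape[0], device=z.device)  # one noise level per sample
        sigma_b = sigma.view(-1, *([1] * (z.ndim - 1)))  # broadcast like sigma[:, None, ...]
        z_sigma = (1 - sigma_b) * z + sigma_b * eps      # noised latent
        target = z - eps                                 # velocity target
        return F.mse_loss(model(z_sigma, sigma).float(), target.float())

    dummy = lambda x, s: torch.zeros_like(x)             # stand-in for the DiT
    print(flow_matching_loss(dummy, torch.randn(1, 12, 4, 8, 8)))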
demos/fine_tuner/trim_and_crop_videos.py ADDED
@@ -0,0 +1,110 @@
1
+ #! /usr/bin/env python3
2
+ from pathlib import Path
3
+ import shutil
4
+
5
+ import click
6
+ from moviepy.editor import VideoFileClip
7
+ from tqdm import tqdm
8
+
9
+
10
+ @click.command()
11
+ @click.argument("folder", type=click.Path(exists=True, dir_okay=True))
12
+ @click.argument("output_folder", type=click.Path(dir_okay=True))
13
+ @click.option("--duration", "-d", type=float, default=5.4, help="Duration in seconds")
14
+ @click.option("--resolution", "-r", type=str, default="848x480", help="Video resolution")
15
+ def truncate_videos(folder, output_folder, duration, resolution):
16
+ """Truncate all MP4 and MOV files in FOLDER to specified duration and resolution"""
17
+ input_path = Path(folder)
18
+ output_path = Path(output_folder)
19
+ output_path.mkdir(parents=True, exist_ok=True)
20
+
21
+ # Parse target resolution
22
+ target_width, target_height = map(int, resolution.split("x"))
23
+
24
+ # Find all MP4 and MOV files
25
+ video_files = (
26
+ list(input_path.rglob("*.mp4"))
27
+ + list(input_path.rglob("*.MOV"))
28
+ + list(input_path.rglob("*.mov"))
29
+ + list(input_path.rglob("*.MP4"))
30
+ )
31
+
32
+ for file_path in tqdm(video_files):
33
+ try:
34
+ relative_path = file_path.relative_to(input_path)
35
+ output_file = output_path / relative_path.with_suffix(".mp4")
36
+ output_file.parent.mkdir(parents=True, exist_ok=True)
37
+
38
+ click.echo(f"Processing: {file_path}")
39
+ video = VideoFileClip(str(file_path))
40
+
41
+ # Skip if video is too short
42
+ if video.duration < duration:
43
+ click.echo(f"Skipping {file_path} as it is too short")
44
+ continue
45
+
46
+ # Skip if target resolution is larger than input
47
+ if target_width > video.w or target_height > video.h:
48
+ click.echo(
49
+ f"Skipping {file_path} as target resolution {resolution} is larger than input {video.w}x{video.h}"
50
+ )
51
+ continue
52
+
53
+ # First truncate duration
54
+ truncated = video.subclip(0, duration)
55
+
56
+ # Calculate crop dimensions to maintain aspect ratio
57
+ target_ratio = target_width / target_height
58
+ current_ratio = truncated.w / truncated.h
59
+
60
+ if current_ratio > target_ratio:
61
+ # Video is wider than target ratio - crop width
62
+ new_width = int(truncated.h * target_ratio)
63
+ x1 = (truncated.w - new_width) // 2
64
+ final = truncated.crop(x1=x1, width=new_width).resize((target_width, target_height))
65
+ else:
66
+ # Video is taller than target ratio - crop height
67
+ new_height = int(truncated.w / target_ratio)
68
+ y1 = (truncated.h - new_height) // 2
69
+ final = truncated.crop(y1=y1, height=new_height).resize((target_width, target_height))
70
+
71
+ # Set output parameters for consistent MP4 encoding
72
+ output_params = {
73
+ "codec": "libx264",
74
+ "audio": False, # Disable audio
75
+ "preset": "medium", # Balance between speed and quality
76
+ "bitrate": "5000k", # Adjust as needed
77
+ }
78
+
79
+ # Set FPS to 30
80
+ final = final.set_fps(30)
81
+
82
+ # Check for a corresponding .txt file
83
+ txt_file_path = file_path.with_suffix('.txt')
84
+ if txt_file_path.exists():
85
+ output_txt_file = output_path / relative_path.with_suffix('.txt')
86
+ output_txt_file.parent.mkdir(parents=True, exist_ok=True)
87
+ shutil.copy(txt_file_path, output_txt_file)
88
+ click.echo(f"Copied {txt_file_path} to {output_txt_file}")
89
+ else:
90
+ # Print warning in bold yellow with a warning emoji
91
+ click.echo(f"\033[1;33m⚠️ Warning: No caption found for {file_path}, using an empty caption. This may hurt fine-tuning quality.\033[0m")
92
+ output_txt_file = output_path / relative_path.with_suffix('.txt')
93
+ output_txt_file.parent.mkdir(parents=True, exist_ok=True)
94
+ output_txt_file.touch()
95
+
96
+ # Write the output file
97
+ final.write_videofile(str(output_file), **output_params)
98
+
99
+ # Clean up
100
+ video.close()
101
+ truncated.close()
102
+ final.close()
103
+
104
+ except Exception as e:
105
+ click.echo(f"\033[1;31m Error processing {file_path}: {str(e)}\033[0m", err=True)
106
+ raise
107
+
108
+
109
+ if __name__ == "__main__":
110
+ truncate_videos()
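The crop step keeps the target aspect ratio (848/480 ≈ 1.77) before resizing. A sketch of the same arithmetic, e.g. for a 1920x1080 source:

    def center_crop_box(w: int, h: int, target_w: int = 848, target_h: int = 480):
        target_ratio = target_w / target_h
        if w / h > target_ratio:          # wider than target: crop width
            new_w = int(h * target_ratio)
            return (w - new_w) // 2, 0, new_w, h
        new_h = int(w / target_ratio)     # taller than target: crop height
        return 0, (h - new_h) // 2, w, new_h

    print(center_crop_box(1920, 1080))    # centered ~1908-px-wide window, then resized to 848x480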
demos/gradio_ui.py ADDED
@@ -0,0 +1,57 @@
1
+ #! /usr/bin/env python
2
+
3
+
4
+ import sys
5
+
6
+ import click
7
+ import gradio as gr
8
+
9
+ sys.path.append("..")
10
+ from cli import configure_model, generate_video
11
+
12
+ with gr.Blocks() as demo:
13
+ gr.Markdown("Video Generator")
14
+ with gr.Row():
15
+ prompt = gr.Textbox(
16
+ label="Prompt",
17
+ value="A hand with delicate fingers picks up a bright yellow lemon from a wooden bowl filled with lemons and sprigs of mint against a peach-colored background. The hand gently tosses the lemon up and catches it, showcasing its smooth texture. A beige string bag sits beside the bowl, adding a rustic touch to the scene. Additional lemons, one halved, are scattered around the base of the bowl. The even lighting enhances the vibrant colors and creates a fresh, inviting atmosphere.",
18
+ )
19
+ negative_prompt = gr.Textbox(label="Negative Prompt", value="")
20
+ seed = gr.Number(label="Seed", value=1710977262, precision=0)
21
+ with gr.Row():
22
+ width = gr.Number(label="Width", value=848, precision=0)
23
+ height = gr.Number(label="Height", value=480, precision=0)
24
+ num_frames = gr.Number(label="Number of Frames", value=163, precision=0)
25
+ with gr.Row():
26
+ cfg_scale = gr.Number(label="CFG Scale", value=6.0)
27
+ num_inference_steps = gr.Number(label="Number of Inference Steps", value=100, precision=0)
28
+ btn = gr.Button("Generate Video")
29
+ output = gr.Video()
30
+
31
+ btn.click(
32
+ generate_video,
33
+ inputs=[
34
+ prompt,
35
+ negative_prompt,
36
+ width,
37
+ height,
38
+ num_frames,
39
+ seed,
40
+ cfg_scale,
41
+ num_inference_steps,
42
+ ],
43
+ outputs=output,
44
+ )
45
+
46
+
47
+ @click.command()
48
+ @click.option("--model_dir", required=True, help="Path to the model directory.")
49
+ @click.option("--lora_path", required=False, help="Path to the lora file.")
50
+ @click.option("--cpu_offload", is_flag=True, help="Whether to offload model to CPU")
51
+ def launch(model_dir, lora_path, cpu_offload):
52
+ configure_model(model_dir, lora_path, cpu_offload)
53
+ demo.launch()
54
+
55
+
56
+ if __name__ == "__main__":
57
+ launch()
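Assuming the weights were downloaded to weights/, the UI can be launched with:

    python demos/gradio_ui.py --model_dir weights/ --cpu_offload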
demos/test_encoder_decoder.py ADDED
@@ -0,0 +1,79 @@
1
+ import time
2
+
3
+ import click
4
+ import torch
5
+ import torchvision
6
+ from einops import rearrange
7
+ from safetensors.torch import load_file
8
+
9
+ from genmo.lib.utils import save_video
10
+ from genmo.mochi_preview.pipelines import DecoderModelFactory, decode_latents_tiled_spatial
11
+ from genmo.mochi_preview.vae.models import Encoder, add_fourier_features
12
+
13
+
14
+ @click.command()
15
+ @click.argument("mochi_dir", type=str)
16
+ @click.argument("video_path", type=click.Path(exists=True))
17
+ def reconstruct(mochi_dir, video_path):
18
+ torch.backends.cuda.matmul.allow_tf32 = True
19
+ torch.backends.cudnn.allow_tf32 = True
20
+
21
+ decoder_factory = DecoderModelFactory(
22
+ model_path=f"{mochi_dir}/decoder.safetensors",
23
+ )
24
+ decoder = decoder_factory.get_model(world_size=1, device_id=0, local_rank=0)
25
+
26
+ config = dict(
27
+ prune_bottlenecks=[False, False, False, False, False],
28
+ has_attentions=[False, True, True, True, True],
29
+ affine=True,
30
+ bias=True,
31
+ input_is_conv_1x1=True,
32
+ padding_mode="replicate",
33
+ )
34
+
35
+ # Create VAE encoder
36
+ encoder = Encoder(
37
+ in_channels=15,
38
+ base_channels=64,
39
+ channel_multipliers=[1, 2, 4, 6],
40
+ num_res_blocks=[3, 3, 4, 6, 3],
41
+ latent_dim=12,
42
+ temporal_reductions=[1, 2, 3],
43
+ spatial_reductions=[2, 2, 2],
44
+ **config,
45
+ )
46
+ device = torch.device("cuda:0")
47
+ encoder = encoder.to(device, memory_format=torch.channels_last_3d)
48
+ encoder.load_state_dict(load_file(f"{mochi_dir}/encoder.safetensors"))
49
+ encoder.eval()
50
+
51
+ video, _, metadata = torchvision.io.read_video(video_path, output_format="THWC")
52
+ fps = metadata["video_fps"]
53
+ video = rearrange(video, "t h w c -> c t h w")
54
+ video = video.unsqueeze(0)
55
+ assert video.dtype == torch.uint8
56
+ # Convert to float in [-1, 1] range.
57
+ video = video.float() / 127.5 - 1.0
58
+ video = video.to(device)
59
+ video = add_fourier_features(video)
60
+ torch.cuda.synchronize()
61
+
62
+ # Encode video to latent
63
+ with torch.inference_mode():
64
+ with torch.autocast("cuda", dtype=torch.bfloat16):
65
+ t0 = time.time()
66
+ ldist = encoder(video)
67
+ torch.cuda.synchronize()
68
+ print(f"Time to encode: {time.time() - t0:.2f}s")
69
+ t0 = time.time()
70
+ frames = decode_latents_tiled_spatial(decoder, ldist.sample(), num_tiles_w=2, num_tiles_h=2)
71
+ torch.cuda.synchronize()
72
+ print(f"Time to decode: {time.time() - t0:.2f}s")
73
+ t0 = time.time()
74
+ save_video(frames.cpu().numpy()[0], f"{video_path}.recon.mp4", fps=fps)
75
+ print(f"Time to save: {time.time() - t0:.2f}s")
76
+
77
+
78
+ if __name__ == "__main__":
79
+ reconstruct()
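Example round-trip check, assuming weights/ contains encoder.safetensors and decoder.safetensors (the video path is a placeholder):

    python demos/test_encoder_decoder.py weights/ path/to/video.mp4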
encoder.safetensors ADDED
@@ -0,0 +1,3 @@
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:d3a8827a66b58a479d97420a9bf77e59078d88f538298469d8db28c37bd556ae
3
+ size 388912864
model_index.json ADDED
@@ -0,0 +1,24 @@
1
+ {
2
+ "_class_name": "MochiPipeline",
3
+ "_diffusers_version": "0.32.0.dev0",
4
+ "scheduler": [
5
+ "diffusers",
6
+ "FlowMatchEulerDiscreteScheduler"
7
+ ],
8
+ "text_encoder": [
9
+ "transformers",
10
+ "T5EncoderModel"
11
+ ],
12
+ "tokenizer": [
13
+ "transformers",
14
+ "T5Tokenizer"
15
+ ],
16
+ "transformer": [
17
+ "diffusers",
18
+ "MochiTransformer3DModel"
19
+ ],
20
+ "vae": [
21
+ "diffusers",
22
+ "AutoencoderKLMochi"
23
+ ]
24
+ }
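Because model_index.json follows the Diffusers layout, the folder can also be loaded through diffusers. A sketch, assuming a diffusers release that ships MochiPipeline (>= 0.32) and using a placeholder path for this repository:

    import torch
    from diffusers import MochiPipeline
    from diffusers.utils import export_to_video

    # Placeholder path: point this at the folder containing model_index.json.
    pipe = MochiPipeline.from_pretrained("path/to/this/repo", torch_dtype=torch.bfloat16)
    pipe.enable_model_cpu_offload()
    frames = pipe("A calico cat naps in a sunbeam.", num_frames=85).frames[0]
    export_to_video(frames, "sample.mp4", fps=30)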
pusa_v0_dit.safetensors ADDED
@@ -0,0 +1,3 @@
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:0b59c4675886494861ec02ea3a11706815efa0e8909dc868109b0b25e79bfcb0
3
+ size 40110801256
pyproject.toml ADDED
@@ -0,0 +1,37 @@
1
+ [project]
2
+ name = "genmo"
3
+ version = "0.1.0"
4
+ description = "Genmo models"
5
+ readme = "README.md"
6
+ requires-python = ">=3.10"
7
+ dependencies = [
8
+ "addict>=2.4.0",
9
+ "av==13.1.0",
10
+ "click>=8.1.7",
11
+ "einops>=0.8.0",
12
+ "gradio>=3.36.1",
13
+ "moviepy==1.0.3",
14
+ "omegaconf>=2.3.0",
15
+ "pillow==9.5.0",
16
+ "pyyaml>=6.0.2",
17
+ "ray>=2.37.0",
18
+ "sentencepiece>=0.2.0",
19
+ "setuptools>=75.2.0",
20
+ "torch>=2.4.1",
21
+ "torchvision>=0.19.1",
22
+ "transformers>=4.45.2",
23
+ ]
24
+
25
+ [project.optional-dependencies]
26
+ flash = [
27
+ "flash-attn>=2.6.3"
28
+ ]
29
+
30
+ torchvision = [
31
+ "torchvision>=0.15.0",
32
+ "pyav>=13.1.0"
33
+ ]
34
+
35
+ [tool.ruff]
36
+ # Allow lines to be as long as 120.
37
+ line-length = 120
pyrightconfig.json ADDED
@@ -0,0 +1,4 @@
1
+ {
2
+ "include": ["src/genmo/mochi_preview/pipelines.py"]
3
+ }
4
+
requirements.txt ADDED
@@ -0,0 +1,14 @@
1
+ addict>=2.4.0
2
+ av==13.1.0
3
+ click>=8.1.7
4
+ einops>=0.8.0
5
+ gradio>=3.36.1
6
+ moviepy==1.0.3
7
+ omegaconf>=2.3.0
8
+ pillow==9.5.0
9
+ pyyaml>=6.0.2
10
+ ray>=2.37.0
11
+ sentencepiece>=0.2.0
12
+ setuptools>=75.2.0
13
+ torch>=2.4.1
14
+ transformers>=4.45.2
scheduler/scheduler_config.json ADDED
@@ -0,0 +1,12 @@
1
+ {
2
+ "_class_name": "FlowMatchEulerDiscreteScheduler",
3
+ "_diffusers_version": "0.32.0.dev0",
4
+ "base_image_seq_len": 256,
5
+ "base_shift": 0.5,
6
+ "invert_sigmas": true,
7
+ "max_image_seq_len": 4096,
8
+ "max_shift": 1.15,
9
+ "num_train_timesteps": 1000,
10
+ "shift": 1.0,
11
+ "use_dynamic_shifting": false
12
+ }
scripts/download_weights.py ADDED
@@ -0,0 +1,41 @@
1
+ #! /usr/bin/env python3
2
+ import os
3
+
4
+ import click
5
+ from huggingface_hub import snapshot_download
6
+
7
+
8
+ # Based off of Kijai's script
9
+ @click.command()
10
+ @click.argument('output_dir', required=True)
11
+ def download_weights(output_dir):
12
+ repo_id = "genmo/mochi-1-preview"
13
+ model = "dit.safetensors"
14
+ decoder = "decoder.safetensors"
15
+ encoder = "encoder.safetensors"
16
+
17
+ if not os.path.exists(output_dir):
18
+ print(f"Creating output directory: {output_dir}")
19
+ os.makedirs(output_dir, exist_ok=True)
20
+
21
+ def download_file(repo_id, output_dir, filename, description):
22
+ file_path = os.path.join(output_dir, filename)
23
+ if not os.path.exists(file_path):
24
+ print(f"Downloading mochi {description} to: {file_path}")
25
+ snapshot_download(
26
+ repo_id=repo_id,
27
+ allow_patterns=[f"*{filename}*"],
28
+ local_dir=output_dir,
29
+ local_dir_use_symlinks=False,
30
+ )
31
+ else:
32
+ print(f"{description} already exists in: {file_path}")
33
+ assert os.path.exists(file_path)
34
+
35
+ download_file(repo_id, output_dir, decoder, "decoder")
36
+ download_file(repo_id, output_dir, encoder, "encoder")
37
+ download_file(repo_id, output_dir, model, "model")
38
+
39
+
40
+ if __name__ == "__main__":
41
+ download_weights()
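Example, fetching everything into a local weights/ folder:

    python3 scripts/download_weights.py weights/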
scripts/format.bash ADDED
@@ -0,0 +1,5 @@
1
+ #! /bin/bash
2
+ set -euxo pipefail
3
+ ruff format src demos
4
+ ruff check --fix --select I src
5
+ ruff check --fix --select I demos
scripts/pytorch_to_safe_tensors.py ADDED
@@ -0,0 +1,24 @@
1
+ #! /usr/bin/env python3
2
+ from pathlib import Path
3
+
4
+ import click
5
+ import torch
6
+ from safetensors.torch import save_file
7
+
8
+
9
+ @click.command()
10
+ @click.argument("input_path", type=click.Path(exists=True))
11
+ def convert_to_safetensors(input_path):
12
+ model = torch.load(input_path)
13
+ model = {
14
+ k: v.contiguous() for k, v in model.items()
15
+ }
16
+ assert 'vae_ema' not in model
17
+ input_path = Path(input_path)
18
+ output_path = input_path.with_suffix(".safetensors")
19
+ save_file(model, str(output_path))
20
+ click.echo(f"Converted {input_path} to {output_path}")
21
+
22
+
23
+ if __name__ == "__main__":
24
+ convert_to_safetensors()
scripts/typecheck.bash ADDED
@@ -0,0 +1,2 @@
1
+ #! /bin/bash
2
+ npx pyright
scripts/weights_to_fp8.py ADDED
File without changes
src/genmo/lib/attn_imports.py ADDED
@@ -0,0 +1,29 @@
1
+ from contextlib import contextmanager
2
+
3
+ import torch
4
+
5
+
6
+ try:
7
+ from flash_attn import flash_attn_varlen_func as flash_varlen_attn
8
+ except ImportError:
9
+ flash_varlen_attn = None
10
+
11
+ try:
12
+ from sageattention import sageattn as sage_attn
13
+ except ImportError:
14
+ sage_attn = None
15
+
16
+ from torch.nn.attention import SDPBackend, sdpa_kernel
17
+
18
+ training_backends = [SDPBackend.FLASH_ATTENTION, SDPBackend.EFFICIENT_ATTENTION]
19
+ eval_backends = list(training_backends)
20
+ if torch.cuda.get_device_properties(0).major >= 9.0:
21
+ # Enable fast CuDNN attention on Hopper.
22
+ # This gives NaN on the backward pass for some reason,
23
+ # so only use it for evaluation.
24
+ eval_backends.append(SDPBackend.CUDNN_ATTENTION)
25
+
26
+ @contextmanager
27
+ def sdpa_attn_ctx(training: bool = False):
28
+ with sdpa_kernel(training_backends if training else eval_backends):
29
+ yield
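A minimal usage sketch of the context manager above (shapes are hypothetical; requires a CUDA device):

    import torch
    import torch.nn.functional as F
    from genmo.lib.attn_imports import sdpa_attn_ctx

    q = k = v = torch.randn(1, 8, 128, 64, device="cuda", dtype=torch.bfloat16)
    with sdpa_attn_ctx(training=False):  # eval mode may also allow the CuDNN backend
        out = F.scaled_dot_product_attention(q, k, v)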
src/genmo/lib/progress.py ADDED
@@ -0,0 +1,87 @@
1
+ import contextlib
2
+ from typing import Any, Iterable, Iterator, Optional
3
+
4
+ try:
5
+ from tqdm import tqdm
6
+ except ImportError:
7
+ tqdm = None
8
+
9
+ try:
10
+ from ray.experimental.tqdm_ray import tqdm as ray_tqdm
11
+ except ImportError:
12
+ ray_tqdm = None
13
+
14
+ # Global state
15
+ _current_progress_type = "none"
16
+ _is_progress_bar_active = False
17
+
18
+
19
+ class DummyProgressBar:
20
+ """A no-op progress bar that mimics tqdm interface"""
21
+
22
+ def __init__(self, iterable=None, **kwargs):
23
+ self.iterable = iterable
24
+
25
+ def __iter__(self):
26
+ return iter(self.iterable)
27
+
28
+ def update(self, n=1):
29
+ pass
30
+
31
+ def close(self):
32
+ pass
33
+
34
+ def set_description(self, desc):
35
+ pass
36
+
37
+
38
+ def get_new_progress_bar(iterable: Optional[Iterable] = None, **kwargs) -> Any:
39
+ if not _is_progress_bar_active:
40
+ return DummyProgressBar(iterable=iterable, **kwargs)
41
+
42
+ if _current_progress_type == "tqdm":
43
+ if tqdm is None:
44
+ raise ImportError("tqdm is required but not installed. Please install tqdm to use the tqdm progress bar.")
45
+ return tqdm(iterable=iterable, **kwargs)
46
+ elif _current_progress_type == "ray_tqdm":
47
+ if ray_tqdm is None:
48
+ raise ImportError("ray is required but not installed. Please install ray to use the ray_tqdm progress bar.")
49
+ return ray_tqdm(iterable=iterable, **kwargs)
50
+ return DummyProgressBar(iterable=iterable, **kwargs)
51
+
52
+
53
+ @contextlib.contextmanager
54
+ def progress_bar(type: str = "none", enabled=True):
55
+ """
56
+ Context manager for setting progress bar type and options.
57
+
58
+ Args:
59
+ type: Type of progress bar ("none", "tqdm", or "ray_tqdm")
+ enabled: If False, force the no-op progress bar regardless of type
61
+
62
+ Raises:
63
+ ValueError: If progress bar type is invalid
64
+ RuntimeError: If progress bars are nested
65
+
66
+ Example:
67
+ with progress_bar(type="tqdm", total=100):
68
+ for i in get_new_progress_bar(range(100)):
69
+ process(i)
70
+ """
71
+ if type not in ("none", "tqdm", "ray_tqdm"):
72
+ raise ValueError("Progress bar type must be 'none' or 'tqdm' or 'ray_tqdm'")
73
+ if not enabled:
74
+ type = "none"
75
+ global _current_progress_type, _is_progress_bar_active
76
+
77
+ if _is_progress_bar_active:
78
+ raise RuntimeError("Nested progress bars are not supported")
79
+
80
+ _is_progress_bar_active = True
81
+ _current_progress_type = type
82
+
83
+ try:
84
+ yield
85
+ finally:
86
+ _is_progress_bar_active = False
87
+ _current_progress_type = "none"
src/genmo/lib/utils.py ADDED
@@ -0,0 +1,67 @@
1
+ import os
2
+ import subprocess
3
+ import tempfile
4
+ import time
5
+
6
+ import numpy as np
7
+ from moviepy.editor import ImageSequenceClip
8
+ from PIL import Image
9
+
10
+ from genmo.lib.progress import get_new_progress_bar
11
+
12
+
13
+ class Timer:
14
+ def __init__(self):
15
+ self.times = {} # Dictionary to store times per stage
16
+
17
+ def __call__(self, name):
18
+ print(f"Timing {name}")
19
+ return self.TimerContextManager(self, name)
20
+
21
+ def print_stats(self):
22
+ total_time = sum(self.times.values())
23
+ # Print table header
24
+ print("{:<20} {:>10} {:>10}".format("Stage", "Time(s)", "Percent"))
25
+ for name, t in self.times.items():
26
+ percent = (t / total_time) * 100 if total_time > 0 else 0
27
+ print("{:<20} {:>10.2f} {:>9.2f}%".format(name, t, percent))
28
+
29
+ class TimerContextManager:
30
+ def __init__(self, outer, name):
31
+ self.outer = outer # Reference to the Timer instance
32
+ self.name = name
33
+ self.start_time = None
34
+
35
+ def __enter__(self):
36
+ self.start_time = time.perf_counter()
37
+ return self
38
+
39
+ def __exit__(self, exc_type, exc_value, traceback):
40
+ end_time = time.perf_counter()
41
+ elapsed = end_time - self.start_time
42
+ self.outer.times[self.name] = self.outer.times.get(self.name, 0) + elapsed
43
+
44
+
45
+ def save_video(final_frames, output_path, fps=30):
46
+ assert final_frames.ndim == 4 and final_frames.shape[3] == 3, f"invalid shape: {final_frames.shape} (need t h w c)"
47
+ if final_frames.dtype != np.uint8:
48
+ final_frames = (final_frames * 255).astype(np.uint8)
49
+ ImageSequenceClip(list(final_frames), fps=fps).write_videofile(output_path)
50
+
51
+
52
+ def create_memory_tracker():
53
+ import torch
54
+
55
+ previous = [None] # Use list for mutable closure state
56
+
57
+ def track(label="all2all"):
58
+ current = torch.cuda.memory_allocated() / 1e9
59
+ if previous[0] is not None:
60
+ diff = current - previous[0]
61
+ sign = "+" if diff >= 0 else ""
62
+ print(f"GPU memory ({label}): {current:.2f} GB ({sign}{diff:.2f} GB)")
63
+ else:
64
+ print(f"GPU memory ({label}): {current:.2f} GB")
65
+ previous[0] = current # type: ignore
66
+
67
+ return track
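A short usage sketch for the Timer helper above:

    import time
    from genmo.lib.utils import Timer

    t = Timer()
    with t("sleep"):
        time.sleep(0.1)
    with t("noop"):
        pass
    t.print_stats()  # prints a per-stage table of seconds and percent of total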
src/genmo/mochi_preview/__init__.py ADDED
File without changes
src/genmo/mochi_preview/dit/joint_model/__init__.py ADDED
File without changes
src/genmo/mochi_preview/dit/joint_model/asymm_models_joint.py ADDED
@@ -0,0 +1,737 @@
1
+ import os
2
+ from typing import Dict, List, Optional, Tuple
3
+ import warnings
4
+
5
+ import torch
6
+ import torch.nn as nn
7
+ import torch.nn.functional as F
8
+ from einops import rearrange
9
+ from torch.nn.attention import sdpa_kernel
10
+
11
+ import genmo.mochi_preview.dit.joint_model.context_parallel as cp
12
+ from genmo.lib.attn_imports import flash_varlen_attn, sage_attn, sdpa_attn_ctx
13
+ from genmo.mochi_preview.dit.joint_model.layers import (
14
+ FeedForward,
15
+ PatchEmbed,
16
+ RMSNorm,
17
+ TimestepEmbedder,
18
+ )
19
+ from genmo.mochi_preview.dit.joint_model.lora import LoraLinear
20
+ from genmo.mochi_preview.dit.joint_model.mod_rmsnorm import modulated_rmsnorm
21
+ from genmo.mochi_preview.dit.joint_model.residual_tanh_gated_rmsnorm import (
22
+ residual_tanh_gated_rmsnorm,
23
+ )
24
+ from genmo.mochi_preview.dit.joint_model.rope_mixed import (
25
+ compute_mixed_rotation,
26
+ create_position_matrix,
27
+ )
28
+ from genmo.mochi_preview.dit.joint_model.temporal_rope import apply_rotary_emb_qk_real
29
+ from genmo.mochi_preview.dit.joint_model.utils import (
30
+ AttentionPool,
31
+ modulate,
32
+ pad_and_split_xy,
33
+ )
34
+
35
+ COMPILE_FINAL_LAYER = os.environ.get("COMPILE_DIT") == "1"
36
+ COMPILE_MMDIT_BLOCK = os.environ.get("COMPILE_DIT") == "1"
37
+
38
+
39
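+ # Convenience wrapper: when `enabled` is True the call runs under activation
+ # checkpointing (activations are recomputed during backward to save memory);
+ # otherwise the function is called directly.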
+ def ck(fn, *args, enabled=True, **kwargs) -> torch.Tensor:
40
+ if enabled:
41
+ return torch.utils.checkpoint.checkpoint(fn, *args, **kwargs, use_reentrant=False)
42
+
43
+ return fn(*args, **kwargs)
44
+
45
+
46
+ class AsymmetricAttention(nn.Module):
47
+ def __init__(
48
+ self,
49
+ dim_x: int,
50
+ dim_y: int,
51
+ num_heads: int = 8,
52
+ qkv_bias: bool = True,
53
+ qk_norm: bool = False,
54
+ update_y: bool = True,
55
+ out_bias: bool = True,
56
+ attention_mode: str = "flash",
57
+ softmax_scale: Optional[float] = None,
58
+ device: Optional[torch.device] = None,
59
+ # Disable LoRA by default ...
60
+ qkv_proj_lora_rank: int = 0,
61
+ qkv_proj_lora_alpha: int = 0,
62
+ qkv_proj_lora_dropout: float = 0.0,
63
+ out_proj_lora_rank: int = 0,
64
+ out_proj_lora_alpha: int = 0,
65
+ out_proj_lora_dropout: float = 0.0,
66
+ ):
67
+ super().__init__()
68
+ self.attention_mode = attention_mode
69
+ self.dim_x = dim_x
70
+ self.dim_y = dim_y
71
+ self.num_heads = num_heads
72
+ self.head_dim = dim_x // num_heads
73
+ self.update_y = update_y
74
+ self.softmax_scale = softmax_scale
75
+ if dim_x % num_heads != 0:
76
+ raise ValueError(f"dim_x={dim_x} should be divisible by num_heads={num_heads}")
77
+
78
+ # Input layers.
79
+ self.qkv_bias = qkv_bias
80
+ qkv_lora_kwargs = dict(
81
+ bias=qkv_bias,
82
+ device=device,
83
+ r=qkv_proj_lora_rank,
84
+ lora_alpha=qkv_proj_lora_alpha,
85
+ lora_dropout=qkv_proj_lora_dropout,
86
+ )
87
+ self.qkv_x = LoraLinear(dim_x, 3 * dim_x, **qkv_lora_kwargs)
88
+ # Project text features to match visual features (dim_y -> dim_x)
89
+ self.qkv_y = LoraLinear(dim_y, 3 * dim_x, **qkv_lora_kwargs)
90
+
91
+ # Query and key normalization for stability.
92
+ assert qk_norm
93
+ self.q_norm_x = RMSNorm(self.head_dim, device=device)
94
+ self.k_norm_x = RMSNorm(self.head_dim, device=device)
95
+ self.q_norm_y = RMSNorm(self.head_dim, device=device)
96
+ self.k_norm_y = RMSNorm(self.head_dim, device=device)
97
+
98
+ # Output layers. y features go back down from dim_x -> dim_y.
99
+ proj_lora_kwargs = dict(
100
+ bias=out_bias,
101
+ device=device,
102
+ r=out_proj_lora_rank,
103
+ lora_alpha=out_proj_lora_alpha,
104
+ lora_dropout=out_proj_lora_dropout,
105
+ )
106
+ self.proj_x = LoraLinear(dim_x, dim_x, **proj_lora_kwargs)
107
+ self.proj_y = LoraLinear(dim_x, dim_y, **proj_lora_kwargs) if update_y else nn.Identity()
108
+
109
+ def run_qkv_y(self, y):
110
+ cp_rank, cp_size = cp.get_cp_rank_size()
111
+ local_heads = self.num_heads // cp_size
112
+
113
+ if cp.is_cp_active():
114
+ # Only predict local heads.
115
+ assert not self.qkv_bias
116
+ W_qkv_y = self.qkv_y.weight.view(3, self.num_heads, self.head_dim, self.dim_y)
117
+ W_qkv_y = W_qkv_y.narrow(1, cp_rank * local_heads, local_heads)
118
+ W_qkv_y = W_qkv_y.reshape(3 * local_heads * self.head_dim, self.dim_y)
119
+ qkv_y = F.linear(y, W_qkv_y, None) # (B, L, 3 * local_h * head_dim)
120
+ else:
121
+ qkv_y = self.qkv_y(y) # (B, L, 3 * dim)
122
+
123
+ qkv_y = qkv_y.view(qkv_y.size(0), qkv_y.size(1), 3, local_heads, self.head_dim)
124
+ q_y, k_y, v_y = qkv_y.unbind(2)
125
+
126
+ q_y = self.q_norm_y(q_y)
127
+ k_y = self.k_norm_y(k_y)
128
+ return q_y, k_y, v_y
129
+
130
+ def prepare_qkv(
131
+ self,
132
+ x: torch.Tensor, # (B, M, dim_x)
133
+ y: torch.Tensor, # (B, L, dim_y)
134
+ *,
135
+ scale_x: torch.Tensor,
136
+ scale_y: torch.Tensor,
137
+ rope_cos: torch.Tensor,
138
+ rope_sin: torch.Tensor,
139
+ valid_token_indices: torch.Tensor,
140
+ max_seqlen_in_batch: int,
141
+ ):
142
+ # Process visual features
143
+ x = modulated_rmsnorm(x, scale_x) # (B, M, dim_x) where M = N / cp_group_size
144
+ qkv_x = self.qkv_x(x) # (B, M, 3 * dim_x)
145
+ assert qkv_x.dtype == torch.bfloat16
146
+
147
+ qkv_x = cp.all_to_all_collect_tokens(qkv_x, self.num_heads) # (3, B, N, local_h, head_dim)
148
+
149
+ # Split qkv_x into q, k, v
150
+ q_x, k_x, v_x = qkv_x.unbind(0) # (B, N, local_h, head_dim)
151
+ q_x = self.q_norm_x(q_x)
152
+ q_x = apply_rotary_emb_qk_real(q_x, rope_cos, rope_sin)
153
+ k_x = self.k_norm_x(k_x)
154
+ k_x = apply_rotary_emb_qk_real(k_x, rope_cos, rope_sin)
155
+
156
+ # Concatenate streams
157
+ B, N, num_heads, head_dim = q_x.size()
158
+ D = num_heads * head_dim
159
+
160
+ # Process text features
161
+ if B == 1:
162
+ text_seqlen = max_seqlen_in_batch - N
163
+ if text_seqlen > 0:
164
+ y = y[:, :text_seqlen] # Remove padding tokens.
165
+ y = modulated_rmsnorm(y, scale_y) # (B, L, dim_y)
166
+ q_y, k_y, v_y = self.run_qkv_y(y) # (B, L, local_heads, head_dim)
167
+
168
+ q = torch.cat([q_x, q_y], dim=1)
169
+ k = torch.cat([k_x, k_y], dim=1)
170
+ v = torch.cat([v_x, v_y], dim=1)
171
+ else:
172
+ q, k, v = q_x, k_x, v_x
173
+ else:
174
+ y = modulated_rmsnorm(y, scale_y) # (B, L, dim_y)
175
+ q_y, k_y, v_y = self.run_qkv_y(y) # (B, L, local_heads, head_dim)
176
+
177
+ indices = valid_token_indices[:, None].expand(-1, D)
178
+ q = torch.cat([q_x, q_y], dim=1).view(-1, D).gather(0, indices) # (total, D)
179
+ k = torch.cat([k_x, k_y], dim=1).view(-1, D).gather(0, indices) # (total, D)
180
+ v = torch.cat([v_x, v_y], dim=1).view(-1, D).gather(0, indices) # (total, D)
181
+
182
+ q = q.view(-1, num_heads, head_dim)
183
+ k = k.view(-1, num_heads, head_dim)
184
+ v = v.view(-1, num_heads, head_dim)
185
+ return q, k, v
186
+
187
+ @torch.autocast("cuda", enabled=False)
188
+ def flash_attention(self, q, k, v, cu_seqlens, max_seqlen_in_batch, total, local_dim):
189
+ out: torch.Tensor = flash_varlen_attn(
190
+ q, k, v,
191
+ cu_seqlens_q=cu_seqlens,
192
+ cu_seqlens_k=cu_seqlens,
193
+ max_seqlen_q=max_seqlen_in_batch,
194
+ max_seqlen_k=max_seqlen_in_batch,
195
+ dropout_p=0.0,
196
+ softmax_scale=self.softmax_scale,
197
+ ) # (total, local_heads, head_dim)
198
+ return out.view(total, local_dim)
199
+
200
+ def sdpa_attention(self, q, k, v):
201
+ with sdpa_attn_ctx(training=self.training):
202
+ out = F.scaled_dot_product_attention(
203
+ q, k, v,
204
+ attn_mask=None,
205
+ dropout_p=0.0,
206
+ is_causal=False,
207
+ )
208
+ return out
209
+
210
+ @torch.autocast("cuda", enabled=False)
211
+ def sage_attention(self, q, k, v):
212
+ return sage_attn(q, k, v, attn_mask=None, dropout_p=0.0, is_causal=False)
213
+
214
+ def run_attention(
215
+ self,
216
+ q: torch.Tensor, # (total <= B * (N + L), num_heads, head_dim)
217
+ k: torch.Tensor, # (total <= B * (N + L), num_heads, head_dim)
218
+ v: torch.Tensor, # (total <= B * (N + L), num_heads, head_dim)
219
+ *,
220
+ B: int,
221
+ cu_seqlens: Optional[torch.Tensor] = None,
222
+ max_seqlen_in_batch: Optional[int] = None,
223
+ ):
224
+ _, cp_size = cp.get_cp_rank_size()
225
+ assert self.num_heads % cp_size == 0
226
+ local_heads = self.num_heads // cp_size
227
+ local_dim = local_heads * self.head_dim
228
+
229
+ # Check shapes
230
+ assert q.ndim == 3 and k.ndim == 3 and v.ndim == 3
231
+ total = q.size(0)
232
+ assert k.size(0) == total and v.size(0) == total
233
+
234
+ if self.attention_mode == "flash":
235
+ out = self.flash_attention(
236
+ q, k, v, cu_seqlens, max_seqlen_in_batch, total, local_dim) # (total, local_dim)
237
+ else:
238
+ assert B == 1, \
239
+ f"Non-flash attention mode {self.attention_mode} only supports batch size 1, got {B}"
240
+
241
+ q = rearrange(q, "(b s) h d -> b h s d", b=B)
242
+ k = rearrange(k, "(b s) h d -> b h s d", b=B)
243
+ v = rearrange(v, "(b s) h d -> b h s d", b=B)
244
+
245
+ if self.attention_mode == "sdpa":
246
+ out = self.sdpa_attention(q, k, v) # (B, local_heads, seq_len, head_dim)
247
+ elif self.attention_mode == "sage":
248
+ out = self.sage_attention(q, k, v) # (B, local_heads, seq_len, head_dim)
249
+ else:
250
+ raise ValueError(f"Unknown attention mode: {self.attention_mode}")
251
+
252
+ out = rearrange(out, "b h s d -> (b s) (h d)")
253
+
254
+ return out
255
+
256
+ def post_attention(
257
+ self,
258
+ out: torch.Tensor,
259
+ B: int,
260
+ M: int,
261
+ L: int,
262
+ dtype: torch.dtype,
263
+ valid_token_indices: torch.Tensor,
264
+ ):
265
+ """
266
+ Args:
267
+ out: (total <= B * (N + L), local_dim)
268
+ valid_token_indices: (total <= B * (N + L),)
269
+ B: Batch size
270
+ M: Number of visual tokens per context parallel rank
271
+ L: Number of text tokens
272
+ dtype: Data type of the input and output tensors
273
+
274
+ Returns:
275
+ x: (B, N, dim_x) tensor of visual tokens where N = M * cp_size
276
+ y: (B, L, dim_y) tensor of text token features
277
+ """
278
+ _, cp_size = cp.get_cp_rank_size()
279
+ local_heads = self.num_heads // cp_size
280
+ local_dim = local_heads * self.head_dim
281
+ N = M * cp_size
282
+
283
+ # Split sequence into visual and text tokens, adding back padding.
284
+ if B == 1:
285
+ out = out.view(B, -1, local_dim)
286
+ if out.size(1) > N:
287
+ x, y = torch.tensor_split(out, (N,), dim=1) # (B, N, local_dim), (B, <= L, local_dim)
288
+ y = F.pad(y, (0, 0, 0, L - y.size(1))) # (B, L, local_dim)
289
+ else:
290
+ # Empty prompt.
291
+ x, y = out, out.new_zeros(B, L, local_dim)
292
+ else:
293
+ x, y = pad_and_split_xy(out, valid_token_indices, B, N, L, dtype)
294
+ assert x.size() == (B, N, local_dim)
295
+ assert y.size() == (B, L, local_dim)
296
+
297
+ # Communicate across context parallel ranks.
298
+ x = x.view(B, N, local_heads, self.head_dim)
299
+ x = cp.all_to_all_collect_heads(x) # (B, M, dim_x = num_heads * head_dim)
300
+ if cp.is_cp_active():
301
+ y = cp.all_gather(y) # (cp_size * B, L, local_heads * head_dim)
302
+ y = rearrange(y, "(G B) L D -> B L (G D)", G=cp_size, D=local_dim) # (B, L, dim_x)
303
+
304
+ x = self.proj_x(x)
305
+ y = self.proj_y(y)
306
+ return x, y
307
+
308
+ def forward(
309
+ self,
310
+ x: torch.Tensor, # (B, M, dim_x)
311
+ y: torch.Tensor, # (B, L, dim_y)
312
+ *,
313
+ scale_x: torch.Tensor, # (B, dim_x), modulation for pre-RMSNorm.
314
+ scale_y: torch.Tensor, # (B, dim_y), modulation for pre-RMSNorm.
315
+ packed_indices: Dict[str, torch.Tensor] = None,
316
+ checkpoint_qkv: bool = False,
317
+ checkpoint_post_attn: bool = False,
318
+ **rope_rotation,
319
+ ) -> Tuple[torch.Tensor, torch.Tensor]:
320
+ """Forward pass of asymmetric multi-modal attention.
321
+
322
+ Args:
323
+ x: (B, M, dim_x) tensor of visual tokens
324
+ y: (B, L, dim_y) tensor of text token features
325
+ packed_indices: Dict with keys for Flash Attention
326
+
328
+ Returns:
329
+ x: (B, M, dim_x) tensor of visual tokens after multi-modal attention
330
+ y: (B, L, dim_y) tensor of text token features after multi-modal attention
331
+ """
332
+ B, L, _ = y.shape
333
+ _, M, _ = x.shape
334
+
335
+ # Compute query, key, and value projections from visual and text features.
336
+ q, k, v = ck(self.prepare_qkv,
337
+ x=x,
338
+ y=y,
339
+ scale_x=scale_x,
340
+ scale_y=scale_y,
341
+ rope_cos=rope_rotation.get("rope_cos"),
342
+ rope_sin=rope_rotation.get("rope_sin"),
343
+ valid_token_indices=packed_indices["valid_token_indices_kv"],
344
+ max_seqlen_in_batch=packed_indices["max_seqlen_in_batch_kv"],
345
+ enabled=checkpoint_qkv,
346
+ ) # q, k, v: each (total <= B * (N + L), local_heads, head_dim)
347
+
348
+ # Attention itself is not activation-checkpointed: recomputing it in the backward pass would be too expensive.
349
+ out = self.run_attention(
350
+ q, k, v, B=B,
351
+ cu_seqlens=packed_indices["cu_seqlens_kv"],
352
+ max_seqlen_in_batch=packed_indices["max_seqlen_in_batch_kv"],
353
+ )
354
+
355
+ x, y = ck(self.post_attention,
356
+ out,
357
+ B=B, M=M, L=L,
358
+ dtype=v.dtype,
359
+ valid_token_indices=packed_indices["valid_token_indices_kv"],
360
+ enabled=checkpoint_post_attn,
361
+ )
362
+
363
+ return x, y
364
+
365
+
366
+ @torch.compile(disable=not COMPILE_MMDIT_BLOCK)
367
+ class AsymmetricJointBlock(nn.Module):
368
+ def __init__(
369
+ self,
370
+ hidden_size_x: int,
371
+ hidden_size_y: int,
372
+ num_heads: int,
373
+ *,
374
+ mlp_ratio_x: float = 8.0, # Ratio of hidden size to d_model for MLP for visual tokens.
375
+ mlp_ratio_y: float = 4.0, # Ratio of hidden size to d_model for MLP for text tokens.
376
+ update_y: bool = True, # Whether to update text tokens in this block.
377
+ device: Optional[torch.device] = None,
378
+ **block_kwargs,
379
+ ):
380
+ super().__init__()
381
+ self.update_y = update_y
382
+ self.hidden_size_x = hidden_size_x
383
+ self.hidden_size_y = hidden_size_y
384
+ self.mod_x = nn.Linear(hidden_size_x, 4 * hidden_size_x, device=device)
385
+ if self.update_y:
386
+ self.mod_y = nn.Linear(hidden_size_x, 4 * hidden_size_y, device=device)
387
+ else:
388
+ self.mod_y = nn.Linear(hidden_size_x, hidden_size_y, device=device)
389
+
390
+ # Self-attention:
391
+ self.attn = AsymmetricAttention(
392
+ hidden_size_x,
393
+ hidden_size_y,
394
+ num_heads=num_heads,
395
+ update_y=update_y,
396
+ device=device,
397
+ **block_kwargs,
398
+ )
399
+
400
+ # MLP.
401
+ mlp_hidden_dim_x = int(hidden_size_x * mlp_ratio_x)
402
+ assert mlp_hidden_dim_x == int(1536 * 8)
403
+ self.mlp_x = FeedForward(
404
+ in_features=hidden_size_x,
405
+ hidden_size=mlp_hidden_dim_x,
406
+ multiple_of=256,
407
+ ffn_dim_multiplier=None,
408
+ device=device,
409
+ )
410
+
411
+ # MLP for text not needed in last block.
412
+ if self.update_y:
413
+ mlp_hidden_dim_y = int(hidden_size_y * mlp_ratio_y)
414
+ self.mlp_y = FeedForward(
415
+ in_features=hidden_size_y,
416
+ hidden_size=mlp_hidden_dim_y,
417
+ multiple_of=256,
418
+ ffn_dim_multiplier=None,
419
+ device=device,
420
+ )
421
+
422
+ def forward(
423
+ self,
424
+ x: torch.Tensor,
425
+ c: torch.Tensor,
426
+ y: torch.Tensor,
427
+ # TODO: These could probably just go into attn_kwargs
428
+ checkpoint_ff: bool = False,
429
+ checkpoint_qkv: bool = False,
430
+ checkpoint_post_attn: bool = False,
431
+ **attn_kwargs,
432
+ ):
433
+ """Forward pass of a block.
434
+
435
+ Args:
436
+ x: (B, N, dim) tensor of visual tokens
437
+ c: (B, dim) tensor of conditioned features
438
+ y: (B, L, dim) tensor of text tokens
439
+
441
+ Returns:
442
+ x: (B, N, dim) tensor of visual tokens after block
443
+ y: (B, L, dim) tensor of text tokens after block
444
+ """
445
+ N = x.size(1)
446
+
447
+ c = F.silu(c)
448
+ mod_x = self.mod_x(c)
449
+ scale_msa_x, gate_msa_x, scale_mlp_x, gate_mlp_x = mod_x.chunk(4, dim=1)
450
+ mod_y = self.mod_y(c)
451
+
452
+ if self.update_y:
453
+ scale_msa_y, gate_msa_y, scale_mlp_y, gate_mlp_y = mod_y.chunk(4, dim=1)
454
+ else:
455
+ scale_msa_y = mod_y
456
+
457
+ # Self-attention block.
458
+ x_attn, y_attn = self.attn(
459
+ x,
460
+ y,
461
+ scale_x=scale_msa_x,
462
+ scale_y=scale_msa_y,
463
+ checkpoint_qkv=checkpoint_qkv,
464
+ checkpoint_post_attn=checkpoint_post_attn,
465
+ **attn_kwargs,
466
+ )
467
+
468
+ assert x_attn.size(1) == N
469
+ x = residual_tanh_gated_rmsnorm(x, x_attn, gate_msa_x)
470
+
471
+ if self.update_y:
472
+ y = residual_tanh_gated_rmsnorm(y, y_attn, gate_msa_y)
473
+
474
+ # MLP block.
475
+ x = ck(self.ff_block_x, x, scale_mlp_x, gate_mlp_x, enabled=checkpoint_ff)
476
+ if self.update_y:
477
+ y = ck(self.ff_block_y, y, scale_mlp_y, gate_mlp_y, enabled=checkpoint_ff) # type: ignore
478
+ return x, y
479
+
480
+ def ff_block_x(self, x, scale_x, gate_x):
481
+ x_mod = modulated_rmsnorm(x, scale_x)
482
+ x_res = self.mlp_x(x_mod)
483
+ x = residual_tanh_gated_rmsnorm(x, x_res, gate_x) # Sandwich norm
484
+ return x
485
+
486
+ def ff_block_y(self, y, scale_y, gate_y):
487
+ y_mod = modulated_rmsnorm(y, scale_y)
488
+ y_res = self.mlp_y(y_mod)
489
+ y = residual_tanh_gated_rmsnorm(y, y_res, gate_y) # Sandwich norm
490
+ return y
491
+
492
+
493
+ @torch.compile(disable=not COMPILE_FINAL_LAYER)
494
+ class FinalLayer(nn.Module):
495
+ """
496
+ The final layer of DiT.
497
+ """
498
+
499
+ def __init__(
500
+ self,
501
+ hidden_size,
502
+ patch_size,
503
+ out_channels,
504
+ device: Optional[torch.device] = None,
505
+ ):
506
+ super().__init__()
507
+ self.norm_final = nn.LayerNorm(hidden_size, elementwise_affine=False, eps=1e-6, device=device)
508
+ self.mod = nn.Linear(hidden_size, 2 * hidden_size, device=device)
509
+ self.linear = nn.Linear(hidden_size, patch_size * patch_size * out_channels, device=device)
510
+
511
+ def forward(self, x, c):
512
+ c = F.silu(c)
513
+ shift, scale = self.mod(c).chunk(2, dim=1)
514
+ x = modulate(self.norm_final(x), shift, scale)
515
+ x = self.linear(x)
516
+ return x
517
+
518
+
519
+ class AsymmDiTJoint(nn.Module):
520
+ """
521
+ Diffusion model with a Transformer backbone.
522
+
523
+ Ingests text embeddings instead of a label.
524
+ """
525
+
526
+ def __init__(
527
+ self,
528
+ *,
529
+ patch_size=2,
530
+ in_channels=4,
531
+ hidden_size_x=1152,
532
+ hidden_size_y=1152,
533
+ depth=48,
534
+ num_heads=16,
535
+ mlp_ratio_x=8.0,
536
+ mlp_ratio_y=4.0,
537
+ t5_feat_dim: int = 4096,
538
+ t5_token_length: int = 256,
539
+ patch_embed_bias: bool = True,
540
+ timestep_mlp_bias: bool = True,
541
+ timestep_scale: Optional[float] = None,
542
+ use_extended_posenc: bool = False,
543
+ rope_theta: float = 10000.0,
544
+ device: Optional[torch.device] = None,
545
+ **block_kwargs,
546
+ ):
547
+ super().__init__()
548
+ self.in_channels = in_channels
549
+ self.out_channels = in_channels
550
+ self.patch_size = patch_size
551
+ self.num_heads = num_heads
552
+ self.hidden_size_x = hidden_size_x
553
+ self.hidden_size_y = hidden_size_y
554
+ self.head_dim = hidden_size_x // num_heads # Head dimension and count are determined by the visual branch.
555
+ self.use_extended_posenc = use_extended_posenc
556
+ self.t5_token_length = t5_token_length
557
+ self.t5_feat_dim = t5_feat_dim
558
+ self.rope_theta = rope_theta # Scaling factor for frequency computation for temporal RoPE.
559
+
560
+ self.x_embedder = PatchEmbed(
561
+ patch_size=patch_size,
562
+ in_chans=in_channels,
563
+ embed_dim=hidden_size_x,
564
+ bias=patch_embed_bias,
565
+ device=device,
566
+ )
567
+ # Conditionings
568
+ # Timestep
569
+ self.t_embedder = TimestepEmbedder(hidden_size_x, bias=timestep_mlp_bias, timestep_scale=timestep_scale)
570
+
571
+ # Caption Pooling (T5)
572
+ self.t5_y_embedder = AttentionPool(t5_feat_dim, num_heads=8, output_dim=hidden_size_x, device=device)
573
+
574
+ # Dense Embedding Projection (T5)
575
+ self.t5_yproj = nn.Linear(t5_feat_dim, hidden_size_y, bias=True, device=device)
576
+
577
+ # Initialize pos_frequencies as an empty parameter.
578
+ self.pos_frequencies = nn.Parameter(torch.empty(3, self.num_heads, self.head_dim // 2, device=device))
579
+
580
+ # for depth 48:
581
+ # b = 0: AsymmetricJointBlock, update_y=True
582
+ # b = 1: AsymmetricJointBlock, update_y=True
583
+ # ...
584
+ # b = 46: AsymmetricJointBlock, update_y=True
585
+ # b = 47: AsymmetricJointBlock, update_y=False. No need to update text features.
586
+ blocks = []
587
+ for b in range(depth):
588
+ # Joint multi-modal block
589
+ update_y = b < depth - 1
590
+ block = AsymmetricJointBlock(
591
+ hidden_size_x,
592
+ hidden_size_y,
593
+ num_heads,
594
+ mlp_ratio_x=mlp_ratio_x,
595
+ mlp_ratio_y=mlp_ratio_y,
596
+ update_y=update_y,
597
+ device=device,
598
+ **block_kwargs,
599
+ )
600
+
601
+ blocks.append(block)
602
+ self.blocks = nn.ModuleList(blocks)
603
+
604
+ self.final_layer = FinalLayer(hidden_size_x, patch_size, self.out_channels, device=device)
605
+
606
+ def embed_x(self, x: torch.Tensor) -> torch.Tensor:
607
+ """
608
+ Args:
609
+ x: (B, C=12, T, H, W) tensor of visual tokens
610
+
611
+ Returns:
612
+ x: (B, N, D=3072) tensor of patch-embedded visual tokens.
613
+ """
614
+ return self.x_embedder(x) # Convert (B, C, T, H, W) to (B, N, D)
615
+
616
+ @torch.compile(disable=not COMPILE_MMDIT_BLOCK)
617
+ def prepare(
618
+ self,
619
+ x: torch.Tensor,
620
+ sigma: torch.Tensor,
621
+ t5_feat: torch.Tensor,
622
+ t5_mask: torch.Tensor,
623
+ ):
624
+ """Prepare input and conditioning embeddings."""
625
+
626
+ # Visual patch embeddings with positional encoding.
627
+ T, H, W = x.shape[-3:]
628
+ pH, pW = H // self.patch_size, W // self.patch_size
629
+ x = self.embed_x(x) # (B, N, D), where N = T * H * W / patch_size ** 2
630
+ assert x.ndim == 3
631
+ B = x.size(0)
632
+
633
+ # Construct position array of size [N, 3].
634
+ # pos[:, 0] is the frame index for each location,
635
+ # pos[:, 1] is the row index for each location, and
636
+ # pos[:, 2] is the column index for each location.
637
+ N = T * pH * pW
638
+ assert x.size(1) == N
639
+ pos = create_position_matrix(T, pH=pH, pW=pW, device=x.device, dtype=torch.float32) # (N, 3)
640
+ rope_cos, rope_sin = compute_mixed_rotation(
641
+ freqs=self.pos_frequencies, pos=pos
642
+ ) # Each are (N, num_heads, dim // 2)
643
+
644
+ # Global vector embedding for conditionings.
645
+ c_t = self.t_embedder(1 - sigma) # (B, D)
646
+
647
+ # Pool T5 tokens using attention pooler
648
+ assert (
650
+ t5_feat.size(1) == self.t5_token_length
651
+ ), f"Expected L={self.t5_token_length}, got {t5_feat.shape} for y_feat."
652
+ t5_y_pool = self.t5_y_embedder(t5_feat, t5_mask) # (B, D)
653
+ assert t5_y_pool.size(0) == B, f"Expected B={B}, got {t5_y_pool.shape} for t5_y_pool."
654
+
655
+ c = c_t + t5_y_pool
656
+
657
+ y_feat = self.t5_yproj(t5_feat) # (B, L, t5_feat_dim) --> (B, L, D)
658
+
659
+ return x, c, y_feat, rope_cos, rope_sin
660
+
661
+ def forward(
662
+ self,
663
+ x: torch.Tensor,
664
+ sigma: torch.Tensor,
665
+ y_feat: List[torch.Tensor],
666
+ y_mask: List[torch.Tensor],
667
+ packed_indices: Dict[str, torch.Tensor] = None,
668
+ rope_cos: torch.Tensor = None,
669
+ rope_sin: torch.Tensor = None,
670
+ num_ff_checkpoint: int = 0,
671
+ num_qkv_checkpoint: int = 0,
672
+ num_post_attn_checkpoint: int = 0,
673
+ ):
674
+ """Forward pass of DiT.
675
+
676
+ Args:
677
+ x: (B, C, T, H, W) tensor of spatial inputs (images or latent representations of images)
678
+ sigma: (B,) tensor of noise standard deviations
679
+ y_feat: List((B, L, y_feat_dim) tensor of caption token features. For the T5 encoder used here: L=256, y_feat_dim=4096)
680
+ y_mask: List((B, L) boolean tensor indicating which tokens are not padding)
681
+ packed_indices: Dict with keys for Flash Attention. Result of compute_packed_indices.
682
+ """
683
+ _, _, T, H, W = x.shape
684
+
685
+ if self.pos_frequencies.dtype != torch.float32:
686
+ warnings.warn(f"pos_frequencies dtype {self.pos_frequencies.dtype} != torch.float32")
687
+
688
+ # Use EFFICIENT_ATTENTION backend for T5 pooling, since we have a mask.
689
+ # Have to call sdpa_kernel outside of a torch.compile region.
690
+ with sdpa_kernel(torch.nn.attention.SDPBackend.EFFICIENT_ATTENTION):
691
+ x, c, y_feat, rope_cos, rope_sin = self.prepare(x, sigma, y_feat[0], y_mask[0])
692
+ del y_mask
693
+
694
+ cp_rank, cp_size = cp.get_cp_rank_size()
695
+ N = x.size(1)
696
+ M = N // cp_size
697
+ assert N % cp_size == 0, f"Visual sequence length ({x.shape[1]}) must be divisible by cp_size ({cp_size})."
698
+
699
+ if cp_size > 1:
700
+ x = x.narrow(1, cp_rank * M, M)
701
+
702
+ assert self.num_heads % cp_size == 0
703
+ local_heads = self.num_heads // cp_size
704
+ rope_cos = rope_cos.narrow(1, cp_rank * local_heads, local_heads)
705
+ rope_sin = rope_sin.narrow(1, cp_rank * local_heads, local_heads)
706
+
707
+ for i, block in enumerate(self.blocks):
708
+ x, y_feat = block(
709
+ x,
710
+ c,
711
+ y_feat,
712
+ rope_cos=rope_cos,
713
+ rope_sin=rope_sin,
714
+ packed_indices=packed_indices,
715
+ checkpoint_ff=i < num_ff_checkpoint,
716
+ checkpoint_qkv=i < num_qkv_checkpoint,
717
+ checkpoint_post_attn=i < num_post_attn_checkpoint,
718
+ ) # (B, M, D), (B, L, D)
719
+ del y_feat # Final layers don't use dense text features.
720
+
721
+ x = self.final_layer(x, c) # (B, M, patch_size ** 2 * out_channels)
722
+
723
+ patch = x.size(2)
724
+ x = cp.all_gather(x)
725
+ x = rearrange(x, "(G B) M P -> B (G M) P", G=cp_size, P=patch)
726
+ x = rearrange(
727
+ x,
728
+ "B (T hp wp) (p1 p2 c) -> B c T (hp p1) (wp p2)",
729
+ T=T,
730
+ hp=H // self.patch_size,
731
+ wp=W // self.patch_size,
732
+ p1=self.patch_size,
733
+ p2=self.patch_size,
734
+ c=self.out_channels,
735
+ )
736
+
737
+ return x
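The DiT forward pass above consumes a `packed_indices` dict with keys `cu_seqlens_kv`, `max_seqlen_in_batch_kv`, and `valid_token_indices_kv`, described as the output of `compute_packed_indices`, which is not part of this diff. The sketch below shows one way such a dict could be assembled from a text padding mask, matching how `prepare_qkv` gathers tokens and how `flash_attention` receives its varlen arguments; it is an illustration under those assumptions, not the repository's implementation.

import torch
import torch.nn.functional as F

def make_packed_indices_sketch(y_mask: torch.Tensor, N: int) -> dict:
    """Illustrative only. y_mask: (B, L) bool, True for real text tokens. N: visual tokens per sample."""
    B, L = y_mask.shape
    # Per-sample sequence length: all N visual tokens plus the non-padded text tokens.
    seqlens = N + y_mask.sum(dim=-1, dtype=torch.int32)                    # (B,)
    cu_seqlens = F.pad(seqlens.cumsum(0, dtype=torch.int32), (1, 0))       # (B + 1,)
    # Flat indices into the concatenated [visual | text] sequence of length B * (N + L).
    keep = torch.cat([torch.ones(B, N, dtype=torch.bool, device=y_mask.device), y_mask], dim=1)
    valid_token_indices = keep.flatten().nonzero(as_tuple=True)[0]         # (total,)
    return {
        "cu_seqlens_kv": cu_seqlens,
        "max_seqlen_in_batch_kv": int(seqlens.max()),
        "valid_token_indices_kv": valid_token_indices,
    }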
src/genmo/mochi_preview/dit/joint_model/context_parallel.py ADDED
@@ -0,0 +1,158 @@
1
+ from typing import Tuple
2
+
3
+ import torch
4
+ import torch.distributed as dist
5
+ from einops import rearrange
6
+
7
+ _CONTEXT_PARALLEL_GROUP = None
8
+ _CONTEXT_PARALLEL_RANK = None
9
+ _CONTEXT_PARALLEL_GROUP_SIZE = None
10
+ _CONTEXT_PARALLEL_GROUP_RANKS = None
11
+
12
+
13
+ def get_cp_rank_size() -> Tuple[int, int]:
14
+ if _CONTEXT_PARALLEL_GROUP:
15
+ assert isinstance(_CONTEXT_PARALLEL_RANK, int) and isinstance(_CONTEXT_PARALLEL_GROUP_SIZE, int)
16
+ return _CONTEXT_PARALLEL_RANK, _CONTEXT_PARALLEL_GROUP_SIZE
17
+ else:
18
+ return 0, 1
19
+
20
+
21
+ def local_shard(x: torch.Tensor, dim: int = 2) -> torch.Tensor:
22
+ if not _CONTEXT_PARALLEL_GROUP:
23
+ return x
24
+
25
+ cp_rank, cp_size = get_cp_rank_size()
26
+ return x.tensor_split(cp_size, dim=dim)[cp_rank]
27
+
28
+
29
+ def set_cp_group(cp_group, ranks, global_rank):
30
+ global _CONTEXT_PARALLEL_GROUP, _CONTEXT_PARALLEL_RANK, _CONTEXT_PARALLEL_GROUP_SIZE, _CONTEXT_PARALLEL_GROUP_RANKS
31
+ if _CONTEXT_PARALLEL_GROUP is not None:
32
+ raise RuntimeError("CP group already initialized.")
33
+ _CONTEXT_PARALLEL_GROUP = cp_group
34
+ _CONTEXT_PARALLEL_RANK = dist.get_rank(cp_group)
35
+ _CONTEXT_PARALLEL_GROUP_SIZE = dist.get_world_size(cp_group)
36
+ _CONTEXT_PARALLEL_GROUP_RANKS = ranks
37
+
38
+ assert _CONTEXT_PARALLEL_RANK == ranks.index(
39
+ global_rank
40
+ ), f"Rank mismatch: {global_rank} in {ranks} does not have position {_CONTEXT_PARALLEL_RANK} "
41
+ assert _CONTEXT_PARALLEL_GROUP_SIZE == len(
42
+ ranks
43
+ ), f"Group size mismatch: {_CONTEXT_PARALLEL_GROUP_SIZE} != len({ranks})"
44
+
45
+
46
+ def get_cp_group():
47
+ if _CONTEXT_PARALLEL_GROUP is None:
48
+ raise RuntimeError("CP group not initialized")
49
+ return _CONTEXT_PARALLEL_GROUP
50
+
51
+
52
+ def is_cp_active():
53
+ return _CONTEXT_PARALLEL_GROUP is not None
54
+
55
+
56
+ class AllGatherIntoTensorFunction(torch.autograd.Function):
57
+ @staticmethod
58
+ def forward(ctx, x: torch.Tensor, reduce_dtype, group: dist.ProcessGroup):
59
+ ctx.reduce_dtype = reduce_dtype
60
+ ctx.group = group
61
+ ctx.batch_size = x.size(0)
62
+ group_size = dist.get_world_size(group)
63
+
64
+ x = x.contiguous()
65
+ output = torch.empty(group_size * x.size(0), *x.shape[1:], dtype=x.dtype, device=x.device)
66
+ dist.all_gather_into_tensor(output, x, group=group)
67
+ return output
68
+
69
+
70
+ def all_gather(tensor: torch.Tensor) -> torch.Tensor:
71
+ if not _CONTEXT_PARALLEL_GROUP:
72
+ return tensor
73
+
74
+ return AllGatherIntoTensorFunction.apply(tensor, torch.float32, _CONTEXT_PARALLEL_GROUP)
75
+
76
+
77
+ @torch.compiler.disable()
78
+ def _all_to_all_single(output, input, group):
79
+ # Disable compilation since torch compile changes contiguity.
80
+ assert input.is_contiguous(), "Input tensor must be contiguous."
81
+ assert output.is_contiguous(), "Output tensor must be contiguous."
82
+ return dist.all_to_all_single(output, input, group=group)
83
+
84
+
85
+ class CollectTokens(torch.autograd.Function):
86
+ @staticmethod
87
+ def forward(ctx, qkv: torch.Tensor, group: dist.ProcessGroup, num_heads: int):
88
+ """Redistribute heads and receive tokens.
89
+
90
+ Args:
91
+ qkv: query, key or value. Shape: [B, M, 3 * num_heads * head_dim]
92
+
93
+ Returns:
94
+ qkv: shape: [3, B, N, local_heads, head_dim]
95
+
96
+ where M is the number of local tokens,
97
+ N = cp_size * M is the number of global tokens,
98
+ local_heads = num_heads // cp_size is the number of local heads.
99
+ """
100
+ ctx.group = group
101
+ ctx.num_heads = num_heads
102
+ cp_size = dist.get_world_size(group)
103
+ assert num_heads % cp_size == 0
104
+ ctx.local_heads = num_heads // cp_size
105
+
106
+ qkv = rearrange(
107
+ qkv,
108
+ "B M (qkv G h d) -> G M h B (qkv d)",
109
+ qkv=3,
110
+ G=cp_size,
111
+ h=ctx.local_heads,
112
+ ).contiguous()
113
+
114
+ output_chunks = torch.empty_like(qkv)
115
+ _all_to_all_single(output_chunks, qkv, group=group)
116
+
117
+ return rearrange(output_chunks, "G M h B (qkv d) -> qkv B (G M) h d", qkv=3)
118
+
119
+
120
+ def all_to_all_collect_tokens(x: torch.Tensor, num_heads: int) -> torch.Tensor:
121
+ if not _CONTEXT_PARALLEL_GROUP:
122
+ # Move QKV dimension to the front.
123
+ # B M (3 H d) -> 3 B M H d
124
+ B, M, _ = x.size()
125
+ x = x.view(B, M, 3, num_heads, -1)
126
+ return x.permute(2, 0, 1, 3, 4)
127
+
128
+ return CollectTokens.apply(x, _CONTEXT_PARALLEL_GROUP, num_heads)
129
+
130
+
131
+ class CollectHeads(torch.autograd.Function):
132
+ @staticmethod
133
+ def forward(ctx, x: torch.Tensor, group: dist.ProcessGroup):
134
+ """Redistribute tokens and receive heads.
135
+
136
+ Args:
137
+ x: Output of attention. Shape: [B, N, local_heads, head_dim]
138
+
139
+ Returns:
140
+ Shape: [B, M, num_heads * head_dim]
141
+ """
142
+ ctx.group = group
143
+ ctx.local_heads = x.size(2)
144
+ ctx.head_dim = x.size(3)
145
+ group_size = dist.get_world_size(group)
146
+ x = rearrange(x, "B (G M) h D -> G h M B D", G=group_size).contiguous()
147
+ output = torch.empty_like(x)
148
+ _all_to_all_single(output, x, group=group)
149
+ del x
150
+ return rearrange(output, "G h M B D -> B M (G h D)")
151
+
152
+
153
+ def all_to_all_collect_heads(x: torch.Tensor) -> torch.Tensor:
154
+ if not _CONTEXT_PARALLEL_GROUP:
155
+ # Merge heads.
156
+ return x.view(x.size(0), x.size(1), x.size(2) * x.size(3))
157
+
158
+ return CollectHeads.apply(x, _CONTEXT_PARALLEL_GROUP)
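When no context-parallel group has been initialized, the two collectives above reduce to pure reshapes, which makes their shape contract easy to check on a single process. The sizes below are arbitrary illustrations.

import torch

B, M, num_heads, head_dim = 2, 16, 4, 8
qkv = torch.randn(B, M, 3 * num_heads * head_dim)

packed = all_to_all_collect_tokens(qkv, num_heads)   # (3, B, M, num_heads, head_dim)
q, k, v = packed.unbind(0)                           # each (B, M, num_heads, head_dim)

out = all_to_all_collect_heads(v)                    # (B, M, num_heads * head_dim)
assert out.shape == (B, M, num_heads * head_dim)

With a CP group active, the same calls also exchange data across ranks: tokens are gathered (M becomes N = cp_size * M) while heads are split, and the reverse happens on the way back.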
src/genmo/mochi_preview/dit/joint_model/layers.py ADDED
@@ -0,0 +1,179 @@
1
+ import collections.abc
2
+ import math
3
+ from itertools import repeat
4
+ from typing import Callable, Optional
5
+
6
+ import torch
7
+ import torch.nn as nn
8
+ import torch.nn.functional as F
9
+ from einops import rearrange
10
+
11
+
12
+ # From PyTorch internals
13
+ def _ntuple(n):
14
+ def parse(x):
15
+ if isinstance(x, collections.abc.Iterable) and not isinstance(x, str):
16
+ return tuple(x)
17
+ return tuple(repeat(x, n))
18
+
19
+ return parse
20
+
21
+
22
+ to_2tuple = _ntuple(2)
23
+
24
+
25
+ class TimestepEmbedder(nn.Module):
26
+ def __init__(
27
+ self,
28
+ hidden_size: int,
29
+ frequency_embedding_size: int = 256,
30
+ *,
31
+ bias: bool = True,
32
+ timestep_scale: Optional[float] = None,
33
+ device: Optional[torch.device] = None,
34
+ ):
35
+ super().__init__()
36
+ self.mlp = nn.Sequential(
37
+ nn.Linear(frequency_embedding_size, hidden_size, bias=bias, device=device),
38
+ nn.SiLU(),
39
+ nn.Linear(hidden_size, hidden_size, bias=bias, device=device),
40
+ )
41
+ self.frequency_embedding_size = frequency_embedding_size
42
+ self.timestep_scale = timestep_scale
43
+
44
+ @staticmethod
45
+ def timestep_embedding(t, dim, max_period=10000):
46
+ half = dim // 2
47
+ freqs = torch.arange(start=0, end=half, dtype=torch.float32, device=t.device)
48
+ freqs.mul_(-math.log(max_period) / half).exp_()
49
+ args = t[:, None].float() * freqs[None]
50
+ embedding = torch.cat([torch.cos(args), torch.sin(args)], dim=-1)
51
+ if dim % 2:
52
+ embedding = torch.cat([embedding, torch.zeros_like(embedding[:, :1])], dim=-1)
53
+ return embedding
54
+
55
+ def forward(self, t):
56
+ if self.timestep_scale is not None:
57
+ t = t * self.timestep_scale
58
+ t_freq = self.timestep_embedding(t, self.frequency_embedding_size)
59
+ t_emb = self.mlp(t_freq)
60
+ return t_emb
61
+
62
+
63
+ class PooledCaptionEmbedder(nn.Module):
64
+ def __init__(
65
+ self,
66
+ caption_feature_dim: int,
67
+ hidden_size: int,
68
+ *,
69
+ bias: bool = True,
70
+ device: Optional[torch.device] = None,
71
+ ):
72
+ super().__init__()
73
+ self.caption_feature_dim = caption_feature_dim
74
+ self.hidden_size = hidden_size
75
+ self.mlp = nn.Sequential(
76
+ nn.Linear(caption_feature_dim, hidden_size, bias=bias, device=device),
77
+ nn.SiLU(),
78
+ nn.Linear(hidden_size, hidden_size, bias=bias, device=device),
79
+ )
80
+
81
+ def forward(self, x):
82
+ return self.mlp(x)
83
+
84
+
85
+ class FeedForward(nn.Module):
86
+ def __init__(
87
+ self,
88
+ in_features: int,
89
+ hidden_size: int,
90
+ multiple_of: int,
91
+ ffn_dim_multiplier: Optional[float],
92
+ device: Optional[torch.device] = None,
93
+ ):
94
+ super().__init__()
95
+ # keep parameter count and computation constant compared to standard FFN
96
+ hidden_size = int(2 * hidden_size / 3)
97
+ # custom dim factor multiplier
98
+ if ffn_dim_multiplier is not None:
99
+ hidden_size = int(ffn_dim_multiplier * hidden_size)
100
+ hidden_size = multiple_of * ((hidden_size + multiple_of - 1) // multiple_of)
101
+
102
+ self.hidden_dim = hidden_size
103
+ self.w1 = nn.Linear(in_features, 2 * hidden_size, bias=False, device=device)
104
+ self.w2 = nn.Linear(hidden_size, in_features, bias=False, device=device)
105
+
106
+ def forward(self, x):
107
+ # assert self.w1.weight.dtype == torch.bfloat16, f"FFN weight dtype {self.w1.weight.dtype} != bfloat16"
108
+ x, gate = self.w1(x).chunk(2, dim=-1)
109
+ x = self.w2(F.silu(x) * gate)
110
+ return x
111
+
112
+
113
+ class PatchEmbed(nn.Module):
114
+ def __init__(
115
+ self,
116
+ patch_size: int = 16,
117
+ in_chans: int = 3,
118
+ embed_dim: int = 768,
119
+ norm_layer: Optional[Callable] = None,
120
+ flatten: bool = True,
121
+ bias: bool = True,
122
+ dynamic_img_pad: bool = False,
123
+ device: Optional[torch.device] = None,
124
+ ):
125
+ super().__init__()
126
+ self.patch_size = to_2tuple(patch_size)
127
+ self.flatten = flatten
128
+ self.dynamic_img_pad = dynamic_img_pad
129
+
130
+ self.proj = nn.Conv2d(
131
+ in_chans,
132
+ embed_dim,
133
+ kernel_size=patch_size,
134
+ stride=patch_size,
135
+ bias=bias,
136
+ device=device,
137
+ )
138
+ assert norm_layer is None
139
+ self.norm = norm_layer(embed_dim, device=device) if norm_layer else nn.Identity()
140
+
141
+ def forward(self, x):
142
+ B, _C, T, H, W = x.shape
143
+ if not self.dynamic_img_pad:
144
+ assert (
145
+ H % self.patch_size[0] == 0
146
+ ), f"Input height ({H}) should be divisible by patch size ({self.patch_size[0]})."
147
+ assert (
148
+ W % self.patch_size[1] == 0
149
+ ), f"Input width ({W}) should be divisible by patch size ({self.patch_size[1]})."
150
+ else:
151
+ pad_h = (self.patch_size[0] - H % self.patch_size[0]) % self.patch_size[0]
152
+ pad_w = (self.patch_size[1] - W % self.patch_size[1]) % self.patch_size[1]
153
+ x = F.pad(x, (0, pad_w, 0, pad_h))
154
+
155
+ x = rearrange(x, "B C T H W -> (B T) C H W", B=B, T=T)
156
+ x = self.proj(x)
157
+
158
+ # Flatten temporal and spatial dimensions.
159
+ if not self.flatten:
160
+ raise NotImplementedError("Must flatten output.")
161
+ x = rearrange(x, "(B T) C H W -> B (T H W) C", B=B, T=T)
162
+
163
+ x = self.norm(x)
164
+ return x
165
+
166
+
167
+ class RMSNorm(torch.nn.Module):
168
+ def __init__(self, hidden_size, eps=1e-5, device=None):
169
+ super().__init__()
170
+ self.eps = eps
171
+ self.weight = torch.nn.Parameter(torch.empty(hidden_size, device=device))
172
+ self.register_parameter("bias", None)
173
+
174
+ def forward(self, x):
175
+ # assert self.weight.dtype == torch.float32, f"RMSNorm weight dtype {self.weight.dtype} != float32"
176
+
177
+ x_fp32 = x.float()
178
+ x_normed = x_fp32 * torch.rsqrt(x_fp32.pow(2).mean(-1, keepdim=True) + self.eps)
179
+ return (x_normed * self.weight).type_as(x)
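FeedForward above is a SwiGLU-style MLP: the requested hidden_size is scaled by 2/3 and rounded up to a multiple of multiple_of, and w1 packs the value and gate projections in one matrix. A quick arithmetic check follows; in_features=3072 and hidden_size=12288 are illustrative assumptions for the example, not values read from a config.

import torch

ff = FeedForward(in_features=3072, hidden_size=12288, multiple_of=256, ffn_dim_multiplier=None)
# int(2 * 12288 / 3) = 8192, already a multiple of 256, so hidden_dim stays 8192.
assert ff.hidden_dim == 8192
assert ff.w1.weight.shape == (2 * 8192, 3072)   # packed [value | gate]
assert ff.w2.weight.shape == (3072, 8192)

x = torch.randn(1, 10, 3072)
assert ff(x).shape == (1, 10, 3072)             # w2(silu(value) * gate)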
src/genmo/mochi_preview/dit/joint_model/lora.py ADDED
@@ -0,0 +1,112 @@
1
+ #! /usr/bin/env python3
2
+ import math
3
+ from typing import Dict, List, Optional
4
+
5
+ import torch
6
+ import torch.nn as nn
7
+ import torch.nn.functional as F
8
+
9
+
10
+ class LoRALayer:
11
+ def __init__(
12
+ self,
13
+ r: int,
14
+ lora_alpha: int,
15
+ lora_dropout: float,
16
+ merge_weights: bool,
17
+ ):
18
+ self.r = r
19
+ self.lora_alpha = lora_alpha
20
+ if lora_dropout > 0.0:
21
+ self.lora_dropout = nn.Dropout(p=lora_dropout)
22
+ else:
23
+ self.lora_dropout = lambda x: x
24
+ self.merged = False
25
+ self.merge_weights = merge_weights
26
+
27
+
28
+ def mark_only_lora_as_trainable(model: nn.Module, bias: str = "none") -> None:
29
+ assert bias == "none", f"Only bias='none' is supported"
30
+ for n, p in model.named_parameters():
31
+ if "lora_" not in n:
32
+ p.requires_grad = False
33
+
34
+
35
+ def lora_state_dict(model: nn.Module, bias: str = "none") -> Dict[str, torch.Tensor]:
36
+ assert bias == "none", f"Only bias='none' is supported"
37
+ my_state_dict = model.state_dict()
38
+ return {k: my_state_dict[k] for k in my_state_dict if "lora_" in k}
39
+
40
+
41
+ class LoraLinear(nn.Linear, LoRALayer):
42
+ # LoRA implemented in a dense layer
43
+ def __init__(
44
+ self,
45
+ in_features: int,
46
+ out_features: int,
47
+ r: int = 0,
48
+ lora_alpha: int = 1,
49
+ lora_dropout: float = 0.0,
50
+ fan_in_fan_out: bool = False, # Set this to True if the layer to replace stores weight like (fan_in, fan_out)
51
+ merge_weights: bool = True,
52
+ **kwargs,
53
+ ):
54
+ nn.Linear.__init__(self, in_features, out_features, **kwargs)
55
+ LoRALayer.__init__(self, r=r, lora_alpha=lora_alpha, lora_dropout=lora_dropout, merge_weights=merge_weights)
56
+
57
+ self.fan_in_fan_out = fan_in_fan_out
58
+ # Actual trainable parameters
59
+ if r > 0:
60
+ self.lora_A = nn.Parameter(self.weight.new_zeros((r, in_features)).to(torch.float32))
61
+ self.lora_B = nn.Parameter(self.weight.new_zeros((out_features, r)).to(torch.float32))
62
+ self.scaling = self.lora_alpha / self.r
63
+
64
+ # Freezing the pre-trained weight matrix
65
+ self.weight.requires_grad = False
66
+
67
+ self.reset_parameters()
68
+
69
+ if fan_in_fan_out:
70
+ self.weight.data = self.weight.data.transpose(0, 1)
71
+
72
+ def reset_parameters(self):
73
+ nn.Linear.reset_parameters(self)
74
+ if hasattr(self, "lora_A"):
75
+ # initialize B the same way as the default for nn.Linear and A to zero
76
+ # this is different than what is described in the paper but should not affect performance
77
+ nn.init.kaiming_uniform_(self.lora_A, a=math.sqrt(5))
78
+ nn.init.zeros_(self.lora_B)
79
+
80
+ def train(self, mode: bool = True):
81
+ def T(w):
82
+ return w.transpose(0, 1) if self.fan_in_fan_out else w
83
+
84
+ nn.Linear.train(self, mode)
85
+ if mode:
86
+ if self.merge_weights and self.merged:
87
+ # Make sure that the weights are not merged
88
+ if self.r > 0:
89
+ self.weight.data -= T(self.lora_B @ self.lora_A) * self.scaling
90
+ self.merged = False
91
+ else:
92
+ if self.merge_weights and not self.merged:
93
+ # Merge the weights and mark it
94
+ if self.r > 0:
95
+ self.weight.data += T(self.lora_B @ self.lora_A) * self.scaling
96
+ self.merged = True
97
+
98
+ def forward(self, x: torch.Tensor):
99
+ def T(w):
100
+ return w.transpose(0, 1) if self.fan_in_fan_out else w
101
+
102
+ if self.r > 0 and not self.merged:
103
+ result = F.linear(x, T(self.weight), bias=self.bias)
104
+
105
+ x = self.lora_dropout(x)
106
+ x = x @ self.lora_A.transpose(0, 1)
107
+ x = x @ self.lora_B.transpose(0, 1)
108
+ x = x * self.scaling
109
+
110
+ return result + x
111
+ else:
112
+ return F.linear(x, T(self.weight), bias=self.bias)
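A minimal usage sketch for the LoRA helpers above: wrap a projection in LoraLinear, freeze everything except the adapter matrices, and export only the adapter weights. The layer size and rank are hypothetical.

import torch

layer = LoraLinear(1024, 1024, r=16, lora_alpha=16, lora_dropout=0.0, bias=False)
mark_only_lora_as_trainable(layer)            # freezes layer.weight, keeps lora_A / lora_B trainable
assert not layer.weight.requires_grad
assert layer.lora_A.requires_grad and layer.lora_B.requires_grad

x = torch.randn(2, 1024)
y = layer(x)                                  # base projection + (dropout(x) @ A^T @ B^T) * scaling

adapter_only = lora_state_dict(layer)         # only the adapter tensors
assert set(adapter_only) == {"lora_A", "lora_B"}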
src/genmo/mochi_preview/dit/joint_model/mod_rmsnorm.py ADDED
@@ -0,0 +1,15 @@
1
+ import torch
2
+
3
+
4
+ def modulated_rmsnorm(x, scale, eps=1e-6):
5
+ dtype = x.dtype
6
+ x = x.float()
7
+
8
+ # Compute RMS
9
+ mean_square = x.pow(2).mean(-1, keepdim=True)
10
+ inv_rms = torch.rsqrt(mean_square + eps)
11
+
12
+ # Normalize and modulate
13
+ x_normed = x * inv_rms
14
+ x_modulated = x_normed * (1 + scale.unsqueeze(1).float())
15
+ return x_modulated.to(dtype)
src/genmo/mochi_preview/dit/joint_model/residual_tanh_gated_rmsnorm.py ADDED
@@ -0,0 +1,20 @@
1
+ import torch
2
+
3
+
4
+ def residual_tanh_gated_rmsnorm(x, x_res, gate, eps=1e-6):
5
+ # Convert to fp32 for precision
6
+ x_res = x_res.float()
7
+
8
+ # Compute RMS
9
+ mean_square = x_res.pow(2).mean(-1, keepdim=True)
10
+ scale = torch.rsqrt(mean_square + eps)
11
+
12
+ # Apply tanh to gate
13
+ tanh_gate = torch.tanh(gate).unsqueeze(1)
14
+
15
+ # Normalize and apply gated scaling
16
+ x_normed = x_res * scale * tanh_gate
17
+
18
+ # Apply residual connection
19
+ output = x + x_normed.type_as(x)
20
+ return output
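Together with modulated_rmsnorm above, this implements the sandwich-norm pattern used around the attention and MLP sub-blocks (see ff_block_x earlier in this diff): normalize and modulate on the way in, then re-normalize, tanh-gate, and add the residual on the way out. A shape sketch with illustrative sizes, with a plain Linear standing in for the sub-block:

import torch
import torch.nn as nn

B, N, D = 2, 8, 64
x = torch.randn(B, N, D)
scale = torch.randn(B, D)     # per-sample modulation, e.g. scale_mlp_x
gate = torch.randn(B, D)      # per-sample gate, e.g. gate_mlp_x
sub_block = nn.Linear(D, D)   # stand-in for the MLP or attention output

x_mod = modulated_rmsnorm(x, scale)                    # RMS-normalize, then scale by (1 + scale)
x_res = sub_block(x_mod)                               # sub-block output
x_out = residual_tanh_gated_rmsnorm(x, x_res, gate)    # re-normalize, tanh-gate, add residual
assert x_out.shape == (B, N, D)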
src/genmo/mochi_preview/dit/joint_model/rope_mixed.py ADDED
@@ -0,0 +1,88 @@
1
+ import functools
2
+ import math
3
+
4
+ import torch
5
+
6
+
7
+ def centers(start: float, stop, num, dtype=None, device=None):
8
+ """linspace through bin centers.
9
+
10
+ Args:
11
+ start (float): Start of the range.
12
+ stop (float): End of the range.
13
+ num (int): Number of points.
14
+ dtype (torch.dtype): Data type of the points.
15
+ device (torch.device): Device of the points.
16
+
17
+ Returns:
18
+ centers (Tensor): Centers of the bins. Shape: (num,).
19
+ """
20
+ edges = torch.linspace(start, stop, num + 1, dtype=dtype, device=device)
21
+ return (edges[:-1] + edges[1:]) / 2
22
+
23
+
24
+ @functools.lru_cache(maxsize=1)
25
+ def create_position_matrix(
26
+ T: int,
27
+ pH: int,
28
+ pW: int,
29
+ device: torch.device,
30
+ dtype: torch.dtype,
31
+ *,
32
+ target_area: float = 36864,
33
+ ):
34
+ """
35
+ Args:
36
+ T: int - Temporal dimension
37
+ pH: int - Height dimension after patchify
38
+ pW: int - Width dimension after patchify
39
+
40
+ Returns:
41
+ pos: [T * pH * pW, 3] - position matrix
42
+ """
43
+ with torch.no_grad():
44
+ # Create 1D tensors for each dimension
45
+ t = torch.arange(T, dtype=dtype)
46
+
47
+ # Positionally interpolate to area 36864.
48
+ # (3072x3072 frame with 16x16 patches = 192x192 latents).
49
+ # This automatically scales rope positions when the resolution changes.
50
+ # We use a large target area so the model is more sensitive
51
+ # to changes in the learned pos_frequencies matrix.
52
+ scale = math.sqrt(target_area / (pW * pH))
53
+ w = centers(-pW * scale / 2, pW * scale / 2, pW)
54
+ h = centers(-pH * scale / 2, pH * scale / 2, pH)
55
+
56
+ # Use meshgrid to create 3D grids
57
+ grid_t, grid_h, grid_w = torch.meshgrid(t, h, w, indexing="ij")
58
+
59
+ # Stack and reshape the grids.
60
+ pos = torch.stack([grid_t, grid_h, grid_w], dim=-1) # [T, pH, pW, 3]
61
+ pos = pos.view(-1, 3) # [T * pH * pW, 3]
62
+ pos = pos.to(dtype=dtype, device=device)
63
+
64
+ return pos
65
+
66
+
67
+ def compute_mixed_rotation(
68
+ freqs: torch.Tensor,
69
+ pos: torch.Tensor,
70
+ ):
71
+ """
72
+ Project each 3-dim position into per-head, per-head-dim 1D frequencies.
73
+
74
+ Args:
75
+ freqs: [3, num_heads, num_freqs] - learned rotation frequency (for t, row, col) for each head position
76
+ pos: [N, 3] - position of each token
77
+ num_heads: int
78
+
79
+ Returns:
80
+ freqs_cos: [N, num_heads, num_freqs] - cosine components
81
+ freqs_sin: [N, num_heads, num_freqs] - sine components
82
+ """
83
+ with torch.autocast("cuda", enabled=False):
84
+ assert freqs.ndim == 3
85
+ freqs_sum = torch.einsum("Nd,dhf->Nhf", pos.to(freqs), freqs)
86
+ freqs_cos = torch.cos(freqs_sum)
87
+ freqs_sin = torch.sin(freqs_sum)
88
+ return freqs_cos, freqs_sin
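A shape sketch of how create_position_matrix and compute_mixed_rotation combine, mirroring the call in AsymmDiTJoint.prepare. The temporal/spatial sizes and head configuration below are illustrative, and the random freqs tensor stands in for the learned pos_frequencies parameter.

import torch

T, pH, pW = 3, 4, 4
num_heads, head_dim = 2, 16

pos = create_position_matrix(T, pH=pH, pW=pW, device=torch.device("cpu"), dtype=torch.float32)
assert pos.shape == (T * pH * pW, 3)                  # (frame, row, col) per token

freqs = torch.randn(3, num_heads, head_dim // 2)      # stand-in for pos_frequencies
rope_cos, rope_sin = compute_mixed_rotation(freqs, pos)
assert rope_cos.shape == (T * pH * pW, num_heads, head_dim // 2)
assert rope_sin.shape == rope_cos.shape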
src/genmo/mochi_preview/dit/joint_model/temporal_rope.py ADDED
@@ -0,0 +1,34 @@
1
+ # Based on Llama3 Implementation.
2
+ import torch
3
+
4
+
5
+ def apply_rotary_emb_qk_real(
6
+ xqk: torch.Tensor,
7
+ freqs_cos: torch.Tensor,
8
+ freqs_sin: torch.Tensor,
9
+ ) -> torch.Tensor:
10
+ """
11
+ Apply rotary embeddings to input tensors using the given frequency tensor without complex numbers.
12
+
13
+ Args:
14
+ xqk (torch.Tensor): Query and/or Key tensors to apply rotary embeddings. Shape: (B, S, *, num_heads, D)
15
+ Can be either just query or just key, or both stacked along some batch or * dim.
16
+ freqs_cos (torch.Tensor): Precomputed cosine frequency tensor.
17
+ freqs_sin (torch.Tensor): Precomputed sine frequency tensor.
18
+
19
+ Returns:
20
+ torch.Tensor: The input tensor with rotary embeddings applied.
21
+ """
22
+ assert xqk.dtype == torch.bfloat16
23
+ # Split the last dimension into even and odd parts
24
+ xqk_even = xqk[..., 0::2]
25
+ xqk_odd = xqk[..., 1::2]
26
+
27
+ # Apply rotation
28
+ cos_part = (xqk_even * freqs_cos - xqk_odd * freqs_sin).type_as(xqk)
29
+ sin_part = (xqk_even * freqs_sin + xqk_odd * freqs_cos).type_as(xqk)
30
+
31
+ # Interleave the results back into the original shape
32
+ out = torch.stack([cos_part, sin_part], dim=-1).flatten(-2)
33
+ assert out.dtype == torch.bfloat16
34
+ return out
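apply_rotary_emb_qk_real expects bfloat16 queries/keys and per-token, per-head cos/sin tables shaped like the output of compute_mixed_rotation. A small shape sketch with illustrative sizes:

import torch

N, num_heads, head_dim = 48, 2, 16
q = torch.randn(1, N, num_heads, head_dim, dtype=torch.bfloat16)
freqs_cos = torch.randn(N, num_heads, head_dim // 2)   # e.g. from compute_mixed_rotation
freqs_sin = torch.randn(N, num_heads, head_dim // 2)

q_rot = apply_rotary_emb_qk_real(q, freqs_cos, freqs_sin)
assert q_rot.shape == q.shape and q_rot.dtype == torch.bfloat16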
src/genmo/mochi_preview/dit/joint_model/utils.py ADDED
@@ -0,0 +1,109 @@
1
+ from typing import Optional, Tuple
2
+
3
+ import torch
4
+ import torch.nn as nn
5
+ import torch.nn.functional as F
6
+
7
+
8
+ def modulate(x, shift, scale):
9
+ return x * (1 + scale.unsqueeze(1)) + shift.unsqueeze(1)
10
+
11
+
12
+ def pool_tokens(x: torch.Tensor, mask: torch.Tensor, *, keepdim=False) -> torch.Tensor:
13
+ """
14
+ Pool tokens in x using mask.
15
+
16
+ NOTE: We assume x does not require gradients.
17
+
18
+ Args:
19
+ x: (B, L, D) tensor of tokens.
20
+ mask: (B, L) boolean tensor indicating which tokens are not padding.
21
+
22
+ Returns:
23
+ pooled: (B, D) tensor of pooled tokens.
24
+ """
25
+ assert x.size(1) == mask.size(1) # Expected mask to have same length as tokens.
26
+ assert x.size(0) == mask.size(0) # Expected mask to have same batch size as tokens.
27
+ mask = mask[:, :, None].to(dtype=x.dtype)
28
+ mask = mask / mask.sum(dim=1, keepdim=True).clamp(min=1)
29
+ pooled = (x * mask).sum(dim=1, keepdim=keepdim)
30
+ return pooled
31
+
32
+
33
+ class AttentionPool(nn.Module):
34
+ def __init__(
35
+ self,
36
+ embed_dim: int,
37
+ num_heads: int,
38
+ output_dim: Optional[int] = None,
39
+ device: Optional[torch.device] = None,
40
+ ):
41
+ """
42
+ Args:
43
+ embed_dim (int): Dimensionality of input tokens.
45
+ num_heads (int): Number of attention heads.
46
+ output_dim (int): Dimensionality of output tokens. Defaults to embed_dim.
47
+ """
48
+ super().__init__()
49
+ self.num_heads = num_heads
50
+ self.to_kv = nn.Linear(embed_dim, 2 * embed_dim, device=device)
51
+ self.to_q = nn.Linear(embed_dim, embed_dim, device=device)
52
+ self.to_out = nn.Linear(embed_dim, output_dim or embed_dim, device=device)
53
+
54
+ def forward(self, x, mask):
55
+ """
56
+ Args:
57
+ x (torch.Tensor): (B, L, D) tensor of input tokens.
58
+ mask (torch.Tensor): (B, L) boolean tensor indicating which tokens are not padding.
59
+
60
+ NOTE: We assume x does not require gradients.
61
+
62
+ Returns:
63
+ x (torch.Tensor): (B, D) tensor of pooled tokens.
64
+ """
65
+ D = x.size(2)
66
+
67
+ # Construct attention mask, shape: (B, 1, num_queries=1, num_keys=1+L).
68
+ attn_mask = mask[:, None, None, :].bool() # (B, 1, 1, L).
69
+ attn_mask = F.pad(attn_mask, (1, 0), value=True) # (B, 1, 1, 1+L).
70
+
71
+ # Average non-padding token features. These will be used as the query.
72
+ x_pool = pool_tokens(x, mask, keepdim=True) # (B, 1, D)
73
+
74
+ # Concat pooled features to input sequence.
75
+ x = torch.cat([x_pool, x], dim=1) # (B, L+1, D)
76
+
77
+ # Compute queries, keys, values. Only the mean token is used to create a query.
78
+ kv = self.to_kv(x) # (B, L+1, 2 * D)
79
+ q = self.to_q(x[:, 0]) # (B, D)
80
+
81
+ # Extract heads.
82
+ head_dim = D // self.num_heads
83
+ kv = kv.unflatten(2, (2, self.num_heads, head_dim)) # (B, 1+L, 2, H, head_dim)
84
+ kv = kv.transpose(1, 3) # (B, H, 2, 1+L, head_dim)
85
+ k, v = kv.unbind(2) # (B, H, 1+L, head_dim)
86
+ q = q.unflatten(1, (self.num_heads, head_dim)) # (B, H, head_dim)
87
+ q = q.unsqueeze(2) # (B, H, 1, head_dim)
88
+
89
+ # Compute attention.
90
+ x = F.scaled_dot_product_attention(q, k, v, attn_mask=attn_mask, dropout_p=0.0) # (B, H, 1, head_dim)
91
+
92
+ # Concatenate heads and run output.
93
+ x = x.squeeze(2).flatten(1, 2) # (B, D = H * head_dim)
94
+ x = self.to_out(x)
95
+ return x
96
+
97
+
98
+ def pad_and_split_xy(xy, indices, B, N, L, dtype) -> Tuple[torch.Tensor, torch.Tensor]:
99
+ D = xy.size(1)
100
+
101
+ # Pad sequences to (B, N + L, dim).
102
+ assert indices.ndim == 1
103
+ indices = indices.unsqueeze(1).expand(-1, D) # (total,) -> (total, num_heads * head_dim)
104
+ output = torch.zeros(B * (N + L), D, device=xy.device, dtype=dtype)
105
+ output = torch.scatter(output, 0, indices, xy)
106
+ xy = output.view(B, N + L, D)
107
+
108
+ # Split visual and text tokens along the sequence length.
109
+ return torch.tensor_split(xy, (N,), dim=1)
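pad_and_split_xy is the inverse of the gather performed in prepare_qkv: it scatters the packed attention output back into a padded (B, N + L, D) layout and splits visual from text tokens. A round-trip sketch with illustrative sizes; valid_indices is built the same way the gather expects it (every visual token plus the non-padded text tokens).

import torch

B, N, L, D = 2, 6, 4, 8
text_mask = torch.tensor([[1, 1, 0, 0], [1, 1, 1, 0]], dtype=torch.bool)   # (B, L)
keep = torch.cat([torch.ones(B, N, dtype=torch.bool), text_mask], dim=1)   # (B, N + L)
valid_indices = keep.flatten().nonzero(as_tuple=True)[0]                   # (total,)

packed = torch.randn(valid_indices.numel(), D)             # packed attention output, (total, D)
x, y = pad_and_split_xy(packed, valid_indices, B, N, L, packed.dtype)
assert x.shape == (B, N, D)    # visual tokens
assert y.shape == (B, L, D)    # text tokens, zero rows where the mask was padding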