Update handler_LAST_WORKING.py

handler_LAST_WORKING.py  (+506 −344)  CHANGED
@@ -1,24 +1,23 @@
 from dataclasses import dataclass
 from pathlib import Path
-import pathlib
-from typing import Dict, Any, Optional, Tuple
-import asyncio
-import base64
-import io
-import pprint
 import logging
 import random
-import
 import os
 import numpy as np
 import torch
-import
-from
-from

 from varnish import Varnish
 from varnish.utils import is_truthy, process_input_image
@@ -27,14 +26,13 @@ from varnish.utils import is_truthy, process_input_image
 logging.basicConfig(level=logging.INFO)
 logger = logging.getLogger(__name__)

-
 # Get token from environment
 hf_token = os.getenv("HF_API_TOKEN")

 # Constraints
 MAX_LARGE_SIDE = 1280
-MAX_SMALL_SIDE = 768
-MAX_FRAMES =

 # Check environment variable for pipeline support
 support_image_prompt = is_truthy(os.getenv("SUPPORT_INPUT_IMAGE_PROMPT"))
@@ -48,10 +46,8 @@ class GenerationConfig:
     negative_prompt: str = "saturated, highlight, overexposed, highlighted, overlit, shaking, too bright, worst quality, inconsistent motion, blurry, jittery, distorted, cropped, watermarked, watermark, logo, subtitle, subtitles, lowres"

     # video model settings (will be used during generation of the initial raw video clip)
-    height: int = 416

     # this is a hack to fool LTX-Video into believing our input image is an actual video frame with poor encoding quality
     # after a quick benchmark using the value 70 seems like a sweet spot
@@ -62,8 +58,8 @@ class GenerationConfig:
     # visual glitches appear after about 169 frames, so we don't need more actually
     num_frames: int = (8 * 14) + 1

-    #
-    guidance_scale: float =

     num_inference_steps: int = 8
@@ -71,16 +67,16 @@ class GenerationConfig:
     seed: int = -1 # -1 means random seed

     # varnish settings (will be used for post-processing after the raw video clip has been generated
-    fps: int = 30
-    double_num_frames: bool = False
-    super_resolution: bool = False

-    grain_amount: float = 0.0

     # audio settings
     enable_audio: bool = False # Whether to generate audio
     audio_prompt: str = "" # Text prompt for audio generation
-    audio_negative_prompt: str = "voices, voice, talking, speaking, speech"

     # The range of the CRF scale is 0–51, where:
     # 0 is lossless (for 8 bit only, for 10 bit use -qp 0)
@@ -92,18 +88,26 @@ class GenerationConfig:
     # The range is exponential, so increasing the CRF value +6 results in roughly half the bitrate / file size, while -6 leads to roughly twice the bitrate.
     quality: int = 18

-    #

-    #

-    #

     def validate_and_adjust(self) -> 'GenerationConfig':
         """Validate and adjust parameters to meet constraints"""
@@ -111,7 +115,7 @@ class GenerationConfig:
         if not ((self.width == MAX_LARGE_SIDE and self.height == MAX_SMALL_SIDE) or
                 (self.width == MAX_SMALL_SIDE and self.height == MAX_LARGE_SIDE)):
             # For other resolutions, ensure total pixels don't exceed max
-            MAX_TOTAL_PIXELS = MAX_SMALL_SIDE * MAX_LARGE_SIDE

             # If total pixels exceed maximum, scale down proportionally
             total_pixels = self.width * self.height
@@ -131,371 +135,527 @@ class GenerationConfig:
         # Set random seed if not specified
         if self.seed == -1:
             self.seed = random.randint(0, 2**32 - 1)
-
-        return self

-[…]
-        """Initialize the handler with LTX models and Varnish
-
-        Args:
-            model_path: Path to LTX model weights
-        """
-        print("EndpointHandler.__init__(): initializing..")
-        # Enable TF32 for potential speedup on Ampere GPUs
-        #torch.backends.cuda.matmul.allow_tf32 = True
-
-        # use distilled weights
-        model_path = Path("/repository/ltxv-2b-0.9.6-distilled-04-25.safetensors")

-[…]
         )

-        vae = AutoencoderKLLTXVideo.from_single_file(model_path, torch_dtype=torch.bfloat16)
-
-        if support_image_prompt:
-            print("EndpointHandler.__init__(): initializing LTXImageToVideoPipeline..")
-            self.image_to_video = LTXImageToVideoPipeline.from_pretrained(
-                "/repository",
-                transformer=transformer,
-                vae=vae,
-                torch_dtype=torch.bfloat16
-            ).to("cuda")
-
-            #apply_teacache(self.image_to_video)

-            #self.image_to_video = torch.compile(self.image_to_video, mode="reduce-overhead", fullgraph=True)

-[…]
         # Initialize Varnish for post-processing
         self.varnish = Varnish(
             device="cuda",
             model_base_dir="/repository/varnish",
-
-            # there is currently a bug with MMAudio and/or torch and/or the weight format and/or version..
-            # not sure how to fix that.. :/
-            #
-            # it says:
-            #   File "dist-packages/varnish.py", line 152, in __init__
-            #     self._setup_mmaudio()
-            #   File "dist-packages/varnish/varnish.py", line 165, in _setup_mmaudio
-            #     net.load_weights(torch.load(model.model_path, map_location=self.device, weights_only=False))
-            #   File "dist-packages/torch/serialization.py", line 1384, in load
-            #     return _legacy_load(
-            #   File "dist-packages/torch/serialization.py", line 1628, in _legacy_load
-            #     magic_number = pickle_module.load(f, **pickle_load_args)
-            #   _pickle.UnpicklingError: invalid load key, '<'.
-            enable_mmaudio=False,
         )
-
-        # Determine if TeaCache is already installed or not
-        self.text_to_video_teacache = False
-        self.image_to_video_teacache = False
-
-    async def process_frames(
-        self,
-        frames: torch.Tensor,
-        config: GenerationConfig
-    ) -> tuple[str, dict]:
-        """Post-process generated frames using Varnish
-        […]
-        """
         try:
-            […]
-                "height": result.metadata.height,
-                "num_frames": result.metadata.frame_count,
-                "fps": result.metadata.fps,
-                "duration": result.metadata.duration,
-                "seed": config.seed,
-            }
         except Exception as e:
-            […]
     def __call__(self, data: Dict[str, Any]) -> Dict[str, Any]:
-        """Process

         Args:
-            data: Request data containing:
-                - parameters (dict):
-                    - prompt (required, string): list of concepts to keep in the video.
-                    - negative_prompt (optional, string): list of concepts to ignore in the video.
-                    - width (optional, int, default to 768): width, or horizontal size in pixels.
-                    - height (optional, int, default to 512): height, or vertical size in pixels.
-                    - input_image_quality (optional, int, default to 100): this is a trick we use to convert a "pristine" image into a "dirty" video frame. This helps fooling LTX-Video into turning the image into an animated one.
-                    - num_frames (optional, int, default to 129): the number of frames must be a multiple of 8, plus 1 frame.
-                    - guidance_scale (optional, float, default to 3.5): Guidance scale (values between 3.0 and 4.0 are nice)
-                    - num_inference_steps (optional, int, default to 50): number of inference steps
-                    - seed (optional, int, default to -1): set a random number generator seed, -1 means random seed.
-                    - fps (optional, int, default to 24): FPS of the final video (eg. 24, 25, 30, 60)
-                    - double_num_frames (optional, bool): if enabled, the number of frames will be multiplied by 2 using RIFE
-                    - super_resolution (optional, bool): if enabled, the resolution will be multiplied by 2 using Real_ESRGAN
-                    - grain_amount (optional, float): amount of film grain to add to the output video
-                    - enable_audio (optional, bool): automatically generate an audio track
-                    - audio_prompt (optional, str): prompt to use for the audio generation (concepts to add)
-                    - audio_negative_prompt (optional, str): negative prompt to use for the audio generation (concepts to ignore)
-                    - quality (optional, int, default to 18): the range of the CRF scale is 0–51, where 0 is lossless (for 8 bit only, for 10 bit use -qp 0), 23 is the default, and 51 is worst quality possible.
-                    - enable_teacache (optional, bool, default to False): generate faster at the cost of a slight quality loss
-                    - teacache_threshold (optional, float, default to 0.05): amount of cache, 0 (original), 0.03 (1.6x speedup), 0.05 (default, 2.1x speedup).
-                    - enable_enhance_a_video (optional, bool, default to False): enable the enhance_a_video optimization
-                    - enhance_a_video_weight (optional, float, default to 5.0): amount of video enhancement to apply
-                    - lora_model_name (optional, str, default to ""): HuggingFace repo ID or path to LoRA model
-                    - lora_model_weight_file (optional, str, default to ""): specific weight file to load from the LoRA model
-                    - lora_model_trigger (optional, str, default to ""): optional trigger word to prepend to the prompt
         Returns:
-            Dictionary containing:
-                - video: Base64 encoded MP4 data URI
-                - content-type: MIME type
-                - metadata: Generation metadata
         """

-        params = data.get("parameters",
-
-        if not
             raise ValueError("Either prompt or image must be provided")
-
-        #logger.debug(f"Raw parameters:")
-        # pprint.pprint(params)
-
         # Create and validate configuration
         config = GenerationConfig(
             # general content settings
             prompt=input_prompt,
             negative_prompt=params.get("negative_prompt", GenerationConfig.negative_prompt),
-
-            # video model settings
             width=params.get("width", GenerationConfig.width),
             height=params.get("height", GenerationConfig.height),
             input_image_quality=params.get("input_image_quality", GenerationConfig.input_image_quality),
             num_frames=params.get("num_frames", GenerationConfig.num_frames),
             guidance_scale=params.get("guidance_scale", GenerationConfig.guidance_scale),
             num_inference_steps=params.get("num_inference_steps", GenerationConfig.num_inference_steps),
-
             # reproducible generation settings
             seed=params.get("seed", GenerationConfig.seed),

-            # varnish settings
-            fps=params.get("fps", GenerationConfig.fps),
-            double_num_frames=params.get("double_num_frames", GenerationConfig.double_num_frames),
-            super_resolution=params.get("super_resolution", GenerationConfig.super_resolution),
             grain_amount=params.get("grain_amount", GenerationConfig.grain_amount),
             enable_audio=params.get("enable_audio", GenerationConfig.enable_audio),
             audio_prompt=params.get("audio_prompt", GenerationConfig.audio_prompt),
             audio_negative_prompt=params.get("audio_negative_prompt", GenerationConfig.audio_negative_prompt),
             quality=params.get("quality", GenerationConfig.quality),

-            #
-            teacache_threshold=params.get("teacache_threshold", 0.05),

-            #
-            lora_model_weight_file=params.get("lora_model_weight_file", ""),
-            lora_model_trigger=params.get("lora_model_trigger", ""),
         ).validate_and_adjust()
384 |
|
385 |
-
#logger.debug(f"Global request settings:")
|
386 |
-
#pprint.pprint(config)
|
387 |
-
|
388 |
try:
|
389 |
-
with torch.amp.
|
390 |
-
# Set random seeds
|
391 |
random.seed(config.seed)
|
392 |
np.random.seed(config.seed)
|
393 |
torch.manual_seed(config.seed)
|
394 |
-
generator = torch.Generator(device='cuda')
|
395 |
-
generator = generator.manual_seed(config.seed)
|
396 |
-
|
397 |
-
# Configure enhance-a-video
|
398 |
-
#if config.enable_enhance_a_video:
|
399 |
-
# enable_enhance()
|
400 |
-
# set_enhance_weight(config.enhance_a_video_weight)
|
401 |
|
402 |
-
#
|
403 |
-
|
404 |
-
|
405 |
-
|
406 |
-
|
407 |
-
|
408 |
-
|
409 |
-
|
410 |
-
|
411 |
-
|
412 |
-
|
413 |
-
|
414 |
-
|
415 |
-
|
416 |
-
|
417 |
-
|
418 |
-
|
419 |
-
# Timestep for decoding VAE noise: the timestep at which generated video is decoded
|
420 |
-
"decode_timestep": 0.05,
|
421 |
-
|
422 |
-
# Noise level for decoding VAE noise: the interpolation factor between random noise and denoised latents at the decode timestep
|
423 |
-
"decode_noise_scale": 0.025,
|
424 |
-
}
|
425 |
-
#logger.info(f"Video model generation settings:")
|
426 |
-
#pprint.pprint(generation_kwargs)
|
427 |
-
|
428 |
-
# Handle LoRA loading/unloading
|
429 |
-
if hasattr(self, '_current_lora_model'):
|
430 |
-
if self._current_lora_model != (config.lora_model_name, config.lora_model_weight_file):
|
431 |
-
# Unload previous LoRA if it exists and is different
|
432 |
-
if hasattr(self.text_to_video, 'unload_lora_weights'):
|
433 |
-
print("Unloading LoRA weights for the text_to_video pipeline..")
|
434 |
-
self.text_to_video.unload_lora_weights()
|
435 |
-
|
436 |
-
if support_image_prompt and hasattr(self.image_to_video, 'unload_lora_weights'):
|
437 |
-
print("Unloading LoRA weights for the image_to_video pipeline..")
|
438 |
-
self.image_to_video.unload_lora_weights()
|
439 |
-
|
440 |
-
if config.lora_model_name:
|
441 |
-
# Load new LoRA
|
442 |
-
if hasattr(self.text_to_video, 'load_lora_weights'):
|
443 |
-
print("Loading LoRA weights for the text_to_video pipeline..")
|
444 |
-
self.text_to_video.load_lora_weights(
|
445 |
-
config.lora_model_name,
|
446 |
-
weight_name=config.lora_model_weight_file if config.lora_model_weight_file else None,
|
447 |
-
token=hf_token,
|
448 |
-
)
|
449 |
-
if support_image_prompt and hasattr(self.image_to_video, 'load_lora_weights'):
|
450 |
-
print("Loading LoRA weights for the image_to_video pipeline..")
|
451 |
-
self.image_to_video.load_lora_weights(
|
452 |
-
config.lora_model_name,
|
453 |
-
weight_name=config.lora_model_weight_file if config.lora_model_weight_file else None,
|
454 |
-
token=hf_token,
|
455 |
)
|
456 |
-
|
457 |
-
|
458 |
-
# Modify prompt if trigger word is provided
|
459 |
-
if config.lora_model_trigger:
|
460 |
-
generation_kwargs["prompt"] = f"{config.lora_model_trigger} {generation_kwargs['prompt']}"
|
461 |
-
|
462 |
-
#enhance_a_video_config = EnhanceAVideoConfig(
|
463 |
-
# weight=config.enhance_a_video_weight if config.enable_enhance_a_video else 0.0,
|
464 |
-
# # doing some testing
|
465 |
-
# num_frames_callback=lambda: (8 + 1),
|
466 |
-
# # num_frames_callback=lambda: config.num_frames,
|
467 |
-
# # num_frames_callback=lambda: (config.num_frames - 1),
|
468 |
-
#
|
469 |
-
# _attention_type=1
|
470 |
-
#)
|
471 |
|
472 |
-
#
|
473 |
-
if
|
474 |
-
|
475 |
-
|
476 |
-
|
477 |
-
|
478 |
-
|
479 |
-
|
480 |
-
|
481 |
-
|
482 |
-
|
483 |
-
|
484 |
-
|
485 |
-
|
486 |
-
|
487 |
-
|
488 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
489 |
try:
|
490 |
loop = asyncio.get_event_loop()
|
491 |
except RuntimeError:
|
492 |
loop = asyncio.new_event_loop()
|
493 |
asyncio.set_event_loop(loop)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
494 |
|
495 |
-
|
496 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
497 |
torch.cuda.empty_cache()
|
498 |
-
torch.cuda.reset_peak_memory_stats()
|
499 |
gc.collect()
|
500 |
|
501 |
return {
|
@@ -503,8 +663,10 @@ class EndpointHandler:
                     "content-type": "video/mp4",
                     "metadata": metadata
                 }
-
         except Exception as e:
-            […]

handler_LAST_WORKING.py
 from dataclasses import dataclass
 from pathlib import Path
 import logging
+import base64
 import random
+import gc
 import os
 import numpy as np
 import torch
+from typing import Dict, Any, Optional, List, Union, Tuple
+import json
+from safetensors import safe_open
+
+from ltx_video.models.autoencoders.causal_video_autoencoder import CausalVideoAutoencoder
+from ltx_video.models.transformers.transformer3d import Transformer3DModel
+from ltx_video.models.transformers.symmetric_patchifier import SymmetricPatchifier
+from ltx_video.schedulers.rf import RectifiedFlowScheduler, TimestepShifter
+from ltx_video.pipelines.pipeline_ltx_video import ConditioningItem, LTXVideoPipeline
+from ltx_video.utils.skip_layer_strategy import SkipLayerStrategy
+from transformers import T5EncoderModel, T5Tokenizer, AutoModelForCausalLM, AutoProcessor, AutoTokenizer

 from varnish import Varnish
 from varnish.utils import is_truthy, process_input_image
 logging.basicConfig(level=logging.INFO)
 logger = logging.getLogger(__name__)

 # Get token from environment
 hf_token = os.getenv("HF_API_TOKEN")

 # Constraints
 MAX_LARGE_SIDE = 1280
+MAX_SMALL_SIDE = 768 # should be 720 but it must be divisible by 32
+MAX_FRAMES = (8 * 21) + 1 # visual glitches appear after about 169 frames, so we cap it

 # Check environment variable for pipeline support
 support_image_prompt = is_truthy(os.getenv("SUPPORT_INPUT_IMAGE_PROMPT"))
     negative_prompt: str = "saturated, highlight, overexposed, highlighted, overlit, shaking, too bright, worst quality, inconsistent motion, blurry, jittery, distorted, cropped, watermarked, watermark, logo, subtitle, subtitles, lowres"

     # video model settings (will be used during generation of the initial raw video clip)
+    width: int = 1216 # 768
+    height: int = 704 # 416

     # this is a hack to fool LTX-Video into believing our input image is an actual video frame with poor encoding quality
     # after a quick benchmark using the value 70 seems like a sweet spot
     # visual glitches appear after about 169 frames, so we don't need more actually
     num_frames: int = (8 * 14) + 1

+    # values between 3.0 and 4.0 are nice
+    guidance_scale: float = 3.0

     num_inference_steps: int = 8
     seed: int = -1 # -1 means random seed

     # varnish settings (will be used for post-processing after the raw video clip has been generated
+    fps: int = 30 # FPS of the final video (only applied at the very end, when converting to mp4)
+    double_num_frames: bool = False # if True, the number of frames will be multiplied by 2 using RIFE
+    super_resolution: bool = False # if True, the resolution will be multiplied by 2 using Real_ESRGAN

+    grain_amount: float = 0.0 # be careful, adding film grain can negatively impact video compression

     # audio settings
     enable_audio: bool = False # Whether to generate audio
     audio_prompt: str = "" # Text prompt for audio generation
+    audio_negative_prompt: str = "voices, voice, talking, speaking, speech" # Negative prompt for audio generation

     # The range of the CRF scale is 0–51, where:
     # 0 is lossless (for 8 bit only, for 10 bit use -qp 0)
     # The range is exponential, so increasing the CRF value +6 results in roughly half the bitrate / file size, while -6 leads to roughly twice the bitrate.
     quality: int = 18

+    # STG (Spatiotemporal Guidance) settings
+    stg_scale: float = 0.0
+    stg_rescale: float = 1.0
+    stg_mode: str = "attention_values" # Can be "attention_values", "attention_skip", "residual", or "transformer_block"

+    # VAE noise augmentation
+    decode_timestep: float = 0.05
+    decode_noise_scale: float = 0.025

+    # Other advanced settings
+    image_cond_noise_scale: float = 0.15
+    mixed_precision: bool = True # Use mixed precision for inference
+    stochastic_sampling: bool = True # Use stochastic sampling
+
+    # Sampling settings
+    sampler: Optional[str] = "from_checkpoint" # "uniform" or "linear-quadratic" or None (use default from checkpoint)
+
+    # Prompt enhancement
+    enhance_prompt: bool = False # Whether to enhance the prompt using an LLM
+    prompt_enhancement_words_threshold: int = 50 # Enhance prompt only if it has fewer words than this
     def validate_and_adjust(self) -> 'GenerationConfig':
         """Validate and adjust parameters to meet constraints"""

         if not ((self.width == MAX_LARGE_SIDE and self.height == MAX_SMALL_SIDE) or
                 (self.width == MAX_SMALL_SIDE and self.height == MAX_LARGE_SIDE)):
             # For other resolutions, ensure total pixels don't exceed max
+            MAX_TOTAL_PIXELS = MAX_SMALL_SIDE * MAX_LARGE_SIDE # or 921600 = 1280 * 720

             # If total pixels exceed maximum, scale down proportionally
             total_pixels = self.width * self.height
         # Set random seed if not specified
         if self.seed == -1:
             self.seed = random.randint(0, 2**32 - 1)

+        # Set up STG parameters
+        if self.stg_mode.lower() == "stg_av" or self.stg_mode.lower() == "attention_values":
+            self.stg_mode = "attention_values"
+        elif self.stg_mode.lower() == "stg_as" or self.stg_mode.lower() == "attention_skip":
+            self.stg_mode = "attention_skip"
+        elif self.stg_mode.lower() == "stg_r" or self.stg_mode.lower() == "residual":
+            self.stg_mode = "residual"
+        elif self.stg_mode.lower() == "stg_t" or self.stg_mode.lower() == "transformer_block":
+            self.stg_mode = "transformer_block"
+
+        # Check if we should enhance the prompt
+        if self.enhance_prompt and self.prompt:
+            prompt_word_count = len(self.prompt.split())
+            if prompt_word_count >= self.prompt_enhancement_words_threshold:
+                logger.info(f"Prompt has {prompt_word_count} words, which exceeds the threshold of {self.prompt_enhancement_words_threshold}. Prompt enhancement disabled.")
+                self.enhance_prompt = False

+        return self
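The proportional downscale itself happens in unchanged lines that the diff does not show. As a rough illustration only (not the file's code), the constraints described above, namely a pixel budget of MAX_SMALL_SIDE * MAX_LARGE_SIDE, sides divisible by 32, and frame counts of the form 8*k + 1 capped at MAX_FRAMES, can be sketched like this:

import math

def scale_to_budget(width: int, height: int, num_frames: int,
                    max_pixels: int = 768 * 1280, max_frames: int = (8 * 21) + 1):
    # Rough sketch of the documented constraints; not the handler's actual code.
    total = width * height
    if total > max_pixels:
        ratio = math.sqrt(max_pixels / total)
        width = int(width * ratio)
        height = int(height * ratio)
    # Round sides down to a multiple of 32
    width = max(32, (width // 32) * 32)
    height = max(32, (height // 32) * 32)
    # Clamp the frame count to the 8*k + 1 pattern and the global cap
    num_frames = min(num_frames, max_frames)
    num_frames = ((num_frames - 1) // 8) * 8 + 1
    return width, height, num_frames

print(scale_to_budget(1920, 1080, 161))  # -> (1312, 736, 161)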
+def load_image_to_tensor_with_resize_and_crop(
+    image_input: Union[str, bytes],
+    target_height: int = 704,
+    target_width: int = 1216,
+    quality: int = 100
+) -> torch.Tensor:
+    """Load and process an image into a tensor.
+
+    Args:
+        image_input: Either a file path (str) or image data (bytes)
+        target_height: Desired height of output tensor
+        target_width: Desired width of output tensor
+        quality: JPEG quality to use when re-encoding (to simulate lower quality images)
+    """
+    from PIL import Image
+    import io
+    import numpy as np
+
+    # Handle base64 data URI
+    if isinstance(image_input, str) and image_input.startswith('data:'):
+        header, encoded = image_input.split(",", 1)
+        image_data = base64.b64decode(encoded)
+        image = Image.open(io.BytesIO(image_data)).convert("RGB")
+    # Handle raw bytes
+    elif isinstance(image_input, bytes):
+        image = Image.open(io.BytesIO(image_input)).convert("RGB")
+    # Handle file path
+    elif isinstance(image_input, str):
+        image = Image.open(image_input).convert("RGB")
+    else:
+        raise ValueError("image_input must be either a file path, bytes, or base64 data URI")
+
+    # Apply JPEG compression if quality < 100 (to simulate a video frame)
+    if quality < 100:
+        buffer = io.BytesIO()
+        image.save(buffer, format="JPEG", quality=quality)
+        buffer.seek(0)
+        image = Image.open(buffer).convert("RGB")
+
+    input_width, input_height = image.size
+    aspect_ratio_target = target_width / target_height
+    aspect_ratio_frame = input_width / input_height
+    if aspect_ratio_frame > aspect_ratio_target:
+        new_width = int(input_height * aspect_ratio_target)
+        new_height = input_height
+        x_start = (input_width - new_width) // 2
+        y_start = 0
+    else:
+        new_width = input_width
+        new_height = int(input_width / aspect_ratio_target)
+        x_start = 0
+        y_start = (input_height - new_height) // 2
+
+    image = image.crop((x_start, y_start, x_start + new_width, y_start + new_height))
+    image = image.resize((target_width, target_height))
+    frame_tensor = torch.tensor(np.array(image)).permute(2, 0, 1).float()
+    frame_tensor = (frame_tensor / 127.5) - 1.0
+    # Create 5D tensor: (batch_size=1, channels=3, num_frames=1, height, width)
+    return frame_tensor.unsqueeze(0).unsqueeze(2)
+
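A quick usage sketch for the helper above (the file path and quality value are examples only):

# Example call; the input path is hypothetical. The helper center-crops to the
# target aspect ratio, resizes, and scales pixel values to [-1, 1].
tensor = load_image_to_tensor_with_resize_and_crop(
    "/tmp/example_input.jpg",  # also accepts raw bytes or a "data:image/..." URI
    target_height=704,
    target_width=1216,
    quality=70,                # re-encode as JPEG q=70 to mimic a compressed video frame
)
print(tensor.shape)  # torch.Size([1, 3, 1, 704, 1216])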
+def calculate_padding(
+    source_height: int, source_width: int, target_height: int, target_width: int
+) -> tuple[int, int, int, int]:
+    """Calculate padding to reach target dimensions"""
+    # Calculate total padding needed
+    pad_height = target_height - source_height
+    pad_width = target_width - source_width
+
+    # Calculate padding for each side
+    pad_top = pad_height // 2
+    pad_bottom = pad_height - pad_top  # Handles odd padding
+    pad_left = pad_width // 2
+    pad_right = pad_width - pad_left  # Handles odd padding
+
+    # Return the padding tuple
+    # Padding format is (left, right, top, bottom)
+    padding = (pad_left, pad_right, pad_top, pad_bottom)
+    return padding
+
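The tuple returned above is already in torch.nn.functional.pad order for the last two dimensions, so it can be applied directly; a small usage sketch with made-up sizes:

import torch
import torch.nn.functional as F

padding = calculate_padding(512, 768, 704, 1216)  # -> (224, 224, 96, 96)
frame = torch.zeros(1, 3, 512, 768)               # dummy frame, example sizes only
padded = F.pad(frame, padding)
print(padded.shape)  # torch.Size([1, 3, 704, 1216])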
+def prepare_conditioning(
+    conditioning_media_paths: List[str],
+    conditioning_strengths: List[float],
+    conditioning_start_frames: List[int],
+    height: int,
+    width: int,
+    num_frames: int,
+    input_image_quality: int = 100,
+    pipeline: Optional[LTXVideoPipeline] = None,
+) -> Optional[List[ConditioningItem]]:
+    """Prepare conditioning items based on input media paths and their parameters"""
+    conditioning_items = []
+    for path, strength, start_frame in zip(
+        conditioning_media_paths, conditioning_strengths, conditioning_start_frames
+    ):
+        # Load and process the conditioning image
+        frame_tensor = load_image_to_tensor_with_resize_and_crop(
+            path, height, width, quality=input_image_quality
+        )
+
+        # Trim frame count if needed
+        if pipeline:
+            frame_count = 1  # For image inputs, it's always 1
+            frame_count = pipeline.trim_conditioning_sequence(
+                start_frame, frame_count, num_frames
+            )
+
+        conditioning_items.append(
+            ConditioningItem(frame_tensor, start_frame, strength)
         )

+    return conditioning_items

+def create_ltx_video_pipeline(
+    config: GenerationConfig,
+    device: str = "cuda"
+) -> LTXVideoPipeline:
+    """Create and configure the LTX video pipeline"""

+    ckpt_path = "/repository/ltxv-2b-0.9.6-distilled-04-25.safetensors"
+
+    # Get allowed inference steps from config if available
+    allowed_inference_steps = None
+
+    assert os.path.exists(
+        ckpt_path
+    ), f"Ckpt path provided (--ckpt_path) {ckpt_path} does not exist"
+
+    with safe_open(ckpt_path, framework="pt") as f:
+        metadata = f.metadata()
+        config_str = metadata.get("config")
+        configs = json.loads(config_str)
+        allowed_inference_steps = configs.get("allowed_inference_steps", None)
+
+    # Initialize model components
+    vae = CausalVideoAutoencoder.from_pretrained(ckpt_path)
+    transformer = Transformer3DModel.from_pretrained(ckpt_path)
+
+    # Use constructor if sampler is specified, otherwise use from_pretrained
+    if config.sampler:
+        scheduler = RectifiedFlowScheduler(
+            sampler=("Uniform" if config.sampler.lower() == "uniform" else "LinearQuadratic")
+        )
+    else:
+        scheduler = RectifiedFlowScheduler.from_pretrained(ckpt_path)
+
+    text_encoder = T5EncoderModel.from_pretrained("/repository/text_encoder")
+    patchifier = SymmetricPatchifier(patch_size=1)
+    tokenizer = T5Tokenizer.from_pretrained("/repository/tokenizer")
+
+    # Move models to the correct device
+    vae = vae.to(device)
+    transformer = transformer.to(device)
+    text_encoder = text_encoder.to(device)
+
+    # Set up precision
+    vae = vae.to(torch.bfloat16)
+    transformer = transformer.to(torch.bfloat16)
+    text_encoder = text_encoder.to(torch.bfloat16)
+
+    # Initialize prompt enhancer components if needed
+    prompt_enhancer_components = {
+        "prompt_enhancer_image_caption_model": None,
+        "prompt_enhancer_image_caption_processor": None,
+        "prompt_enhancer_llm_model": None,
+        "prompt_enhancer_llm_tokenizer": None
+    }
+
+    if config.enhance_prompt:
+        try:
+            # Use default models or ones specified by config
+            prompt_enhancer_image_caption_model = AutoModelForCausalLM.from_pretrained(
+                "MiaoshouAI/Florence-2-large-PromptGen-v2.0",
+                trust_remote_code=True
+            )
+            prompt_enhancer_image_caption_processor = AutoProcessor.from_pretrained(
+                "MiaoshouAI/Florence-2-large-PromptGen-v2.0",
+                trust_remote_code=True
+            )
+            prompt_enhancer_llm_model = AutoModelForCausalLM.from_pretrained(
+                "unsloth/Llama-3.2-3B-Instruct",
+                torch_dtype="bfloat16",
+            )
+            prompt_enhancer_llm_tokenizer = AutoTokenizer.from_pretrained(
+                "unsloth/Llama-3.2-3B-Instruct",
+            )

+            prompt_enhancer_components = {
+                "prompt_enhancer_image_caption_model": prompt_enhancer_image_caption_model,
+                "prompt_enhancer_image_caption_processor": prompt_enhancer_image_caption_processor,
+                "prompt_enhancer_llm_model": prompt_enhancer_llm_model,
+                "prompt_enhancer_llm_tokenizer": prompt_enhancer_llm_tokenizer
+            }
+        except Exception as e:
+            logger.warning(f"Failed to load prompt enhancer models: {e}")
+            config.enhance_prompt = False
+
+    # Construct the pipeline
+    pipeline = LTXVideoPipeline(
+        transformer=transformer,
+        patchifier=patchifier,
+        text_encoder=text_encoder,
+        tokenizer=tokenizer,
+        scheduler=scheduler,
+        vae=vae,
+        allowed_inference_steps=allowed_inference_steps,
+        **prompt_enhancer_components
+    )
+
+    return pipeline
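The allowed_inference_steps value comes from the checkpoint's safetensors metadata; a minimal standalone check of that header (same path as above, key name assumed to match what the loader reads) would be:

import json
from safetensors import safe_open

# Inspect the metadata the loader above relies on; prints None if the key is absent.
with safe_open("/repository/ltxv-2b-0.9.6-distilled-04-25.safetensors", framework="pt") as f:
    meta = f.metadata() or {}
    cfg = json.loads(meta.get("config", "{}"))
    print(cfg.get("allowed_inference_steps"))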

+class EndpointHandler:
+    """Handler for the LTX Video endpoint"""
+
+    def __init__(self, model_path: str = "/repository/"):
+        """Initialize the endpoint handler
+
+        Args:
+            model_path: Path to model weights (not used, as weights are in current directory)
+        """
+        # Enable TF32 for potential speedup on Ampere GPUs
+        torch.backends.cuda.matmul.allow_tf32 = True
+
         # Initialize Varnish for post-processing
         self.varnish = Varnish(
             device="cuda",
             model_base_dir="/repository/varnish",
+            enable_mmaudio=False, # Disable audio generation for now, since it is broken
         )

+        # The actual LTX pipeline will be loaded during inference to save memory
+        self.pipeline = None
+
+        # Perform warm-up inference
+        logger.info("Performing warm-up inference...")
+        self._warmup()
+        logger.info("Warm-up completed!")
+
+    def _warmup(self):
+        """Perform a warm-up inference to prepare the model for future requests"""
         try:
+            # Create a simple test configuration
+            test_config = GenerationConfig(
+                prompt="an astronaut is riding a cow in the desert, during golden hour",
+                negative_prompt="worst quality, lowres",
+                width=768,  # Using smaller resolution for faster warm-up
+                height=416,
+                num_frames=33,  # Just enough frames for a valid video
+                guidance_scale=1.0,
+                num_inference_steps=4,  # Fewer steps for faster warm-up
+                seed=42,  # Fixed seed for consistent warm-up
+                fps=16,  # Lower FPS for faster processing
+                enable_audio=False,  # No audio for warm-up
+                mixed_precision=True,
+            ).validate_and_adjust()

+            # Create the pipeline if it doesn't exist
+            if self.pipeline is None:
+                self.pipeline = create_ltx_video_pipeline(test_config)

+            # Run a quick inference
+            with torch.amp.autocast(device_type='cuda', dtype=torch.bfloat16), torch.no_grad():
+                # Set seeds for reproducibility
+                random.seed(test_config.seed)
+                np.random.seed(test_config.seed)
+                torch.manual_seed(test_config.seed)
+                generator = torch.Generator(device='cuda').manual_seed(test_config.seed)
+
+                # Generate video
+                result = self.pipeline(
+                    height=test_config.height,
+                    width=test_config.width,
+                    num_frames=test_config.num_frames,
+                    frame_rate=test_config.fps,
+                    prompt=test_config.prompt,
+                    negative_prompt=test_config.negative_prompt,
+                    guidance_scale=test_config.guidance_scale,
+                    num_inference_steps=test_config.num_inference_steps,
+                    generator=generator,
+                    output_type="pt",
+                    mixed_precision=test_config.mixed_precision,
+                    is_video=True,
+                    vae_per_channel_normalize=True,
+                )
+
+                # Just get the frames without full processing (faster warm-up)
+                frames = result.images
+
+                # Clean up
+                del result
+                torch.cuda.empty_cache()
+                gc.collect()
+
+                logger.info(f"Warm-up successful! Generated {frames.shape[2]} frames at {frames.shape[3]}x{frames.shape[4]}")
+
         except Exception as e:
+            # Log the error but don't fail initialization
+            import traceback
+            error_message = f"Warm-up failed (but this is non-critical): {str(e)}\n{traceback.format_exc()}"
+            logger.warning(error_message)
+
     def __call__(self, data: Dict[str, Any]) -> Dict[str, Any]:
+        """Process inference requests

         Args:
+            data: Request data containing inputs and parameters

         Returns:
+            Dictionary with generated video and metadata
         """
+        # Extract inputs and parameters
+        inputs = data.get("inputs", {})

+        # Support both formats:
+        # 1. {"inputs": {"prompt": "...", "image": "..."}}
+        # 2. {"inputs": "..."} (prompt only)
+        if isinstance(inputs, str):
+            input_prompt = inputs
+            input_image = None
+        else:
+            input_prompt = inputs.get("prompt", "")
+            input_image = inputs.get("image")

+        params = data.get("parameters", {})
+
+        if not input_prompt and not input_image:
             raise ValueError("Either prompt or image must be provided")
+
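For illustration, a request that exercises the "inputs" and "parameters" contract handled above could look like the sketch below; every value is a made-up example and only keys appearing in this file are used:

# Hypothetical request payload; the handler also accepts {"inputs": "a prompt string"}.
example_request = {
    "inputs": {
        "prompt": "a fox running through tall grass at sunset",
        # "image": "data:image/jpeg;base64,...",  # optional image conditioning
    },
    "parameters": {
        "width": 1216,
        "height": 704,
        "num_frames": (8 * 14) + 1,
        "guidance_scale": 3.0,
        "num_inference_steps": 8,
        "fps": 30,
        "seed": -1,        # -1 picks a random seed
        "enable_audio": False,
        "quality": 18,     # CRF, lower means better quality
    },
}
# handler = EndpointHandler()
# response = handler(example_request)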
         # Create and validate configuration
         config = GenerationConfig(
             # general content settings
             prompt=input_prompt,
             negative_prompt=params.get("negative_prompt", GenerationConfig.negative_prompt),
+
+            # video model settings
             width=params.get("width", GenerationConfig.width),
             height=params.get("height", GenerationConfig.height),
             input_image_quality=params.get("input_image_quality", GenerationConfig.input_image_quality),
             num_frames=params.get("num_frames", GenerationConfig.num_frames),
             guidance_scale=params.get("guidance_scale", GenerationConfig.guidance_scale),
             num_inference_steps=params.get("num_inference_steps", GenerationConfig.num_inference_steps),
+
+            # STG settings
+            stg_scale=params.get("stg_scale", GenerationConfig.stg_scale),
+            stg_rescale=params.get("stg_rescale", GenerationConfig.stg_rescale),
+            stg_mode=params.get("stg_mode", GenerationConfig.stg_mode),
+
+            # VAE noise settings
+            decode_timestep=params.get("decode_timestep", GenerationConfig.decode_timestep),
+            decode_noise_scale=params.get("decode_noise_scale", GenerationConfig.decode_noise_scale),
+            image_cond_noise_scale=params.get("image_cond_noise_scale", GenerationConfig.image_cond_noise_scale),
+
             # reproducible generation settings
             seed=params.get("seed", GenerationConfig.seed),

+            # varnish settings
+            fps=params.get("fps", GenerationConfig.fps),
+            double_num_frames=params.get("double_num_frames", GenerationConfig.double_num_frames),
+            super_resolution=params.get("super_resolution", GenerationConfig.super_resolution),
             grain_amount=params.get("grain_amount", GenerationConfig.grain_amount),
             enable_audio=params.get("enable_audio", GenerationConfig.enable_audio),
             audio_prompt=params.get("audio_prompt", GenerationConfig.audio_prompt),
             audio_negative_prompt=params.get("audio_negative_prompt", GenerationConfig.audio_negative_prompt),
             quality=params.get("quality", GenerationConfig.quality),

+            # advanced settings
+            mixed_precision=params.get("mixed_precision", GenerationConfig.mixed_precision),
+            stochastic_sampling=params.get("stochastic_sampling", GenerationConfig.stochastic_sampling),
+            sampler=params.get("sampler", GenerationConfig.sampler),

+            # prompt enhancement
+            enhance_prompt=params.get("enhance_prompt", GenerationConfig.enhance_prompt),
+            prompt_enhancement_words_threshold=params.get(
+                "prompt_enhancement_words_threshold",
+                GenerationConfig.prompt_enhancement_words_threshold
+            ),
         ).validate_and_adjust()

         try:
+            with torch.amp.autocast(device_type='cuda', dtype=torch.bfloat16), torch.no_grad():
+                # Set random seeds for reproducibility
                 random.seed(config.seed)
                 np.random.seed(config.seed)
                 torch.manual_seed(config.seed)
+                generator = torch.Generator(device='cuda').manual_seed(config.seed)

+                # Create pipeline if not already created
+                if self.pipeline is None:
+                    self.pipeline = create_ltx_video_pipeline(config)
+
+                # Prepare conditioning items if an image is provided
+                conditioning_items = None
+                if input_image:
+                    conditioning_items = [
+                        ConditioningItem(
+                            load_image_to_tensor_with_resize_and_crop(
+                                input_image,
+                                config.height,
+                                config.width,
+                                quality=config.input_image_quality
+                            ),
+                            0,   # Start frame
+                            1.0  # Conditioning strength
                         )
+                    ]

+                # Set up spatiotemporal guidance strategy
+                if config.stg_mode == "attention_values":
+                    skip_layer_strategy = SkipLayerStrategy.AttentionValues
+                elif config.stg_mode == "attention_skip":
+                    skip_layer_strategy = SkipLayerStrategy.AttentionSkip
+                elif config.stg_mode == "residual":
+                    skip_layer_strategy = SkipLayerStrategy.Residual
+                elif config.stg_mode == "transformer_block":
+                    skip_layer_strategy = SkipLayerStrategy.TransformerBlock
+
+                # Generate video with LTX pipeline
+                result = self.pipeline(
+                    height=config.height,
+                    width=config.width,
+                    num_frames=config.num_frames,
+                    frame_rate=config.fps,
+                    prompt=config.prompt,
+                    negative_prompt=config.negative_prompt,
+                    guidance_scale=config.guidance_scale,
+                    num_inference_steps=config.num_inference_steps,
+                    generator=generator,
+                    output_type="pt",  # Return as PyTorch tensor
+                    skip_layer_strategy=skip_layer_strategy,
+                    stg_scale=config.stg_scale,
+                    do_rescaling=config.stg_rescale != 1.0,
+                    rescaling_scale=config.stg_rescale,
+                    conditioning_items=conditioning_items,
+                    decode_timestep=config.decode_timestep,
+                    decode_noise_scale=config.decode_noise_scale,
+                    image_cond_noise_scale=config.image_cond_noise_scale,
+                    mixed_precision=config.mixed_precision,
+                    is_video=True,
+                    vae_per_channel_normalize=True,
+                    stochastic_sampling=config.stochastic_sampling,
+                    enhance_prompt=config.enhance_prompt,
+                )
+
+                # Get the generated frames
+                frames = result.images
+
+                # FIX: Convert LTX output format to varnish-compatible format
+                # LTX outputs: [batch, channels, frames, height, width]
+                # We need: [frames, channels, height, width] for varnish
+                frames = frames.squeeze(0)  # Remove batch: [channels, frames, height, width]
+                frames = frames.permute(1, 0, 2, 3)  # Reorder to: [frames, channels, height, width]
+
+                # Convert from [0, 1] to [0, 255] range
+                frames = frames * 255.0
+
+                # Convert to uint8
+                frames = frames.to(torch.uint8)
+
+                # Process the generated frames with Varnish
+                import asyncio
                 try:
                     loop = asyncio.get_event_loop()
                 except RuntimeError:
                     loop = asyncio.new_event_loop()
                     asyncio.set_event_loop(loop)
+
+                # Process with Varnish for post-processing
+                varnish_result = loop.run_until_complete(
+                    self.varnish(
+                        frames,
+                        fps=config.fps,
+                        double_num_frames=config.double_num_frames,
+                        super_resolution=config.super_resolution,
+                        grain_amount=config.grain_amount,
+                        enable_audio=config.enable_audio,
+                        audio_prompt=config.audio_prompt or config.prompt,
+                        audio_negative_prompt=config.audio_negative_prompt,
+                    )
+                )

+                # Get the final video as a data URI
+                video_uri = loop.run_until_complete(
+                    varnish_result.write(
+                        type="data-uri",
+                        quality=config.quality
+                    )
+                )
+
+                # Prepare metadata about the generated video
+                metadata = {
+                    "width": varnish_result.metadata.width,
+                    "height": varnish_result.metadata.height,
+                    "num_frames": varnish_result.metadata.frame_count,
+                    "fps": varnish_result.metadata.fps,
+                    "duration": varnish_result.metadata.duration,
+                    "seed": config.seed,
+                    "prompt": config.prompt,
+                }
+
+                # Clean up to prevent CUDA OOM errors
+                del result
                 torch.cuda.empty_cache()
                 gc.collect()

                 return {
                     "content-type": "video/mp4",
                     "metadata": metadata
                 }
+
         except Exception as e:
+            # Log the error and reraise
+            import traceback
+            error_message = f"Error generating video: {str(e)}\n{traceback.format_exc()}"
+            logger.error(error_message)
+            raise RuntimeError(error_message)
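On the client side, the returned data URI can be decoded straight back to an MP4 file; a minimal sketch, assuming the response carries the base64 video under the "video" key as the docstrings describe:

import base64

def save_video(response: dict, path: str = "output.mp4") -> None:
    # response["video"] is expected to be a data URI like "data:video/mp4;base64,AAAA..."
    _, encoded = response["video"].split(",", 1)
    with open(path, "wb") as f:
        f.write(base64.b64decode(encoded))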