---
library_name: diffusers
pipeline_tag: text-to-video
---

This is a tiny, randomly initialized HunyuanVideo pipeline intended for fast tests. Model created with:

```python
import torch 
from diffusers import HunyuanVideoTransformer3DModel, AutoencoderKLHunyuanVideo, FlowMatchEulerDiscreteScheduler, HunyuanVideoPipeline
from transformers import LlamaConfig, LlamaModel, CLIPTextModel, CLIPTextConfig, AutoTokenizer, CLIPTokenizer

def get_dummy_components(num_layers=1, num_single_layers=1):
    torch.manual_seed(0)
    transformer = HunyuanVideoTransformer3DModel(
        in_channels=4,
        out_channels=4,
        num_attention_heads=2,
        attention_head_dim=10,
        num_layers=num_layers,
        num_single_layers=num_single_layers,
        num_refiner_layers=1,
        patch_size=1,
        patch_size_t=1,
        guidance_embeds=True,
        text_embed_dim=16,
        pooled_projection_dim=8,
        rope_axes_dim=(2, 4, 4),
    )

    torch.manual_seed(0)
    vae = AutoencoderKLHunyuanVideo(
        in_channels=3,
        out_channels=3,
        latent_channels=4,
        down_block_types=(
            "HunyuanVideoDownBlock3D",
            "HunyuanVideoDownBlock3D",
            "HunyuanVideoDownBlock3D",
            "HunyuanVideoDownBlock3D",
        ),
        up_block_types=(
            "HunyuanVideoUpBlock3D",
            "HunyuanVideoUpBlock3D",
            "HunyuanVideoUpBlock3D",
            "HunyuanVideoUpBlock3D",
        ),
        block_out_channels=(8, 8, 8, 8),
        layers_per_block=1,
        act_fn="silu",
        norm_num_groups=4,
        scaling_factor=0.476986,
        spatial_compression_ratio=8,
        temporal_compression_ratio=4,
        mid_block_add_attention=True,
    )

    torch.manual_seed(0)
    scheduler = FlowMatchEulerDiscreteScheduler(shift=7.0)

    llama_text_encoder_config = LlamaConfig(
        bos_token_id=0,
        eos_token_id=2,
        hidden_size=16,
        intermediate_size=37,
        layer_norm_eps=1e-05,
        num_attention_heads=4,
        num_hidden_layers=2,
        pad_token_id=1,
        vocab_size=1000,
        hidden_act="gelu",
        projection_dim=32,
    )
    clip_text_encoder_config = CLIPTextConfig(
        bos_token_id=0,
        eos_token_id=2,
        hidden_size=8,
        intermediate_size=37,
        layer_norm_eps=1e-05,
        num_attention_heads=4,
        num_hidden_layers=2,
        pad_token_id=1,
        vocab_size=1000,
        hidden_act="gelu",
        projection_dim=32,
    )

    torch.manual_seed(0)
    text_encoder = LlamaModel(llama_text_encoder_config)
    tokenizer = AutoTokenizer.from_pretrained("llama_small_tokenizer")  # local directory produced by the tokenizer script below

    torch.manual_seed(0)
    text_encoder_2 = CLIPTextModel(clip_text_encoder_config)
    tokenizer_2 = CLIPTokenizer.from_pretrained("hf-internal-testing/tiny-random-clip")

    components = {
        "transformer": transformer,
        "vae": vae,
        "scheduler": scheduler,
        "text_encoder": text_encoder,
        "text_encoder_2": text_encoder_2,
        "tokenizer": tokenizer,
        "tokenizer_2": tokenizer_2,
    }
    return components

components = get_dummy_components()
pipeline = HunyuanVideoPipeline(**components)

inputs = {
    "prompt": "sky is dark moon is yellow",
    "prompt_template": {
        "template": "{}",
        "crop_start": 0,
    },
    "num_inference_steps": 2,
    "guidance_scale": 4.5,
    "height": 16,
    "width": 16,
    # num_frames should be 4 * k + 1, since the VAE's temporal_compression_ratio is 4
    "num_frames": 9,
    "max_sequence_length": 16,
    "output_type": "pt",
}

pipeline(**inputs)  # smoke-test the pipeline before pushing

pipeline.push_to_hub("finetrainers/dummy-hunyaunvideo")
```
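
The pushed checkpoint can be loaded back like any other `diffusers` pipeline. A minimal sketch, assuming the repo id from the `push_to_hub` call above:

```python
from diffusers import HunyuanVideoPipeline

pipe = HunyuanVideoPipeline.from_pretrained("finetrainers/dummy-hunyaunvideo")
out = pipe(
    prompt="sky is dark moon is yellow",
    prompt_template={"template": "{}", "crop_start": 0},
    num_inference_steps=2,
    guidance_scale=4.5,
    height=16,
    width=16,
    num_frames=9,
    max_sequence_length=16,
    output_type="pt",
)
print(out.frames.shape)  # tiny video tensor, since output_type="pt"
```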

Tokenizer created with the script below (thanks, ChatGPT):

```python
#!/usr/bin/env python

"""
Small script to:
 1) Create a tiny text corpus
 2) Train a SentencePiece tokenizer (unigram) on that corpus
 3) Wrap it with Hugging Face's LlamaTokenizer
 4) Test encode/decode
 5) Save and reload the tokenizer

Dependencies:
    pip install transformers sentencepiece
"""

import os
import sentencepiece as spm
from transformers import LlamaTokenizer

def main():
    # ---------------------------------------------------------
    # 1. Prepare a Tiny Corpus
    # ---------------------------------------------------------
    small_corpus = """Hello Llama!
    This is a small corpus for testing.
    Llama is a language model family by Meta.
    """
    
    corpus_file = "small_corpus.txt"
    with open(corpus_file, "w", encoding="utf-8") as f:
        f.write(small_corpus)
    
    # ---------------------------------------------------------
    # 2. Train a Small SentencePiece Model
    # ---------------------------------------------------------
    # Note: LLaMA typically uses a vocab of ~32k tokens.
    # Here, we use vocab_size=36 just for demonstration.
    model_prefix = "llama_small"
    
    spm.SentencePieceTrainer.Train(
        input=corpus_file,
        model_prefix=model_prefix,
        vocab_size=36,
        model_type="unigram",  # a tiny unigram model is enough for this dummy tokenizer
        character_coverage=1.0,
        pad_id=0,   # PAD=0
        unk_id=1,   # UNK=1
        bos_id=2,   # BOS=2
        eos_id=3    # EOS=3
    )
    
    model_file = model_prefix + ".model"
    vocab_file = model_prefix + ".vocab"
    
    if not (os.path.exists(model_file) and os.path.exists(vocab_file)):
        raise FileNotFoundError("SentencePiece training output not found.")
    
    print("Tokenizer training complete!")
    print(f"Generated files: {model_file}, {vocab_file}")
    
    # ---------------------------------------------------------
    # 3. Use the Trained Model with LlamaTokenizer
    # ---------------------------------------------------------
    tokenizer = LlamaTokenizer(model_file)
    tokenizer.add_special_tokens(
        {"pad_token": "<unk>", "bos_token": "<s>", "eos_token": "</s>", "unk_token": "<unk>"}
    )
    
    # ---------------------------------------------------------
    # 4. Test the Tokenizer (Encode/Decode)
    # ---------------------------------------------------------
    test_text = "Hello Llama, how are you?"
    encoded = tokenizer(test_text)
    decoded = tokenizer.decode(encoded["input_ids"])
    
    print("\n--- Tokenizer Test ---")
    print("Original text: ", test_text)
    print("Encoded:       ", encoded)
    print("Decoded:       ", decoded)
    
    # ---------------------------------------------------------
    # 5. Save and Reload the Tokenizer
    # ---------------------------------------------------------
    save_directory = "llama_small_tokenizer"
    os.makedirs(save_directory, exist_ok=True)
    tokenizer.save_pretrained(save_directory)
    
    # Reload
    new_tokenizer = LlamaTokenizer.from_pretrained(save_directory)
    
    # Verify it works
    encoded_again = new_tokenizer(test_text)
    decoded_again = new_tokenizer.decode(encoded_again["input_ids"])
    
    print("\n--- Reloaded Tokenizer Test ---")
    print("Original text: ", test_text)
    print("Encoded:       ", encoded_again)
    print("Decoded:       ", decoded_again)

if __name__ == "__main__":
    main()
```
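
The first script loads this tokenizer with `AutoTokenizer.from_pretrained("llama_small_tokenizer")`, so the `save_directory` above must exist in the working directory when that script runs. A quick sanity check, assuming the tokenizer script has been run first:

```python
from transformers import AutoTokenizer

tok = AutoTokenizer.from_pretrained("llama_small_tokenizer")
ids = tok("Hello Llama!")["input_ids"]
print(ids)             # ids come from the 36-token vocabulary trained above
print(tok.decode(ids))
```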