ljleb committed on
Commit 01f5cc2 · verified · 1 Parent(s): 83bd1f3

Upload 3 files

Files changed (3)
  1. download_checkpoints.py +139 -0
  2. joint_loss.py +510 -0
  3. prepare_dataset.py +320 -0
download_checkpoints.py ADDED
@@ -0,0 +1,139 @@
+import subprocess
+import sys
+import time
+import argparse
+import os
+import datetime
+
+
+def main():
+    parser = argparse.ArgumentParser(description="Download checkpoint pair from remote runpod machine with unique filenames.")
+    parser.add_argument("--remote-ip", required=True, help="Remote machine IP address")
+    parser.add_argument("--remote-port", required=True, type=int, help="Remote SSH port")
+    parser.add_argument("--remote-user", required=True, help="Username for remote SSH")
+    parser.add_argument("--remote-base-path", default="/workspace", help="Directory on remote machine containing checkpoints and lock file")
+    parser.add_argument("--local-dest", required=True, help="Local directory where checkpoints should be saved")
+    parser.add_argument("--rsa-key", required=True, help="Path to your RSA private key for authentication")
+    parser.add_argument("--poll-interval", type=float, default=10, help="Polling interval in seconds")
+
+    args = parser.parse_args()
+
+    # Construct remote file paths.
+    remote_checkpoint_a = f"{args.remote_base_path}/grads_a.safetensors"
+    remote_checkpoint_b = f"{args.remote_base_path}/grads_b.safetensors"
+    remote_inv_log_scalars = f"{args.remote_base_path}/log_scalars.safetensors"
+    remote_thresholds = f"{args.remote_base_path}/thresholds.safetensors"
+    remote_lock_file = f"{args.remote_base_path}/safetensors.lock"
+
+    print("Starting remote checkpoint monitor...")
+    while True:
+        # Check if the lock file exists on the remote machine.
+        if remote_file_exists(args.remote_user, args.remote_ip, args.remote_port, remote_lock_file, args.rsa_key):
+            print("New checkpoints detected. Downloading...")
+
+            # Generate unique filenames for each model.
+            local_checkpoint_a = get_unique_filename(args.local_dest, "grads_a")
+            local_checkpoint_b = get_unique_filename(args.local_dest, "grads_b")
+            local_inv_log_scalars = get_unique_filename(args.local_dest, "log_scalars")
+            local_thresholds = get_unique_filename(args.local_dest, "thresholds")
+
+            try:
+                # Download both checkpoints with the unique filenames.
+                download_file(args.remote_user, args.remote_ip, args.remote_port, remote_checkpoint_a, local_checkpoint_a, args.rsa_key)
+                download_file(args.remote_user, args.remote_ip, args.remote_port, remote_checkpoint_b, local_checkpoint_b, args.rsa_key)
+                download_file(args.remote_user, args.remote_ip, args.remote_port, remote_inv_log_scalars, local_inv_log_scalars, args.rsa_key)
+                # download_file(args.remote_user, args.remote_ip, args.remote_port, remote_thresholds, local_thresholds, args.rsa_key)
+            except subprocess.CalledProcessError as e:
+                print(f"Download error: {e}")
+                time.sleep(args.poll_interval)
+                continue
+
+            # After successful download, delete only the lock file on the remote side.
+            try:
+                while not delete_remote_lock(args.remote_user, args.remote_ip, args.remote_port, remote_lock_file, args.rsa_key):
+                    continue
+
+                print("Download complete. Checkpoints saved as:")
+                print(f" {local_checkpoint_a}")
+                print(f" {local_checkpoint_b}")
+                print("Remote lock file deleted.")
+            except subprocess.CalledProcessError as e:
+                print(f"Error deleting remote lock file: {e}")
+        else:
+            print("No checkpoints found.")
+
+        time.sleep(args.poll_interval)
+
+
+def remote_file_exists(remote_user, remote_host, remote_port, remote_path, rsa_key, timeout=10):
+    """Check if a file exists on the remote machine."""
+    cmd = [
+        "ssh",
+        "-i", rsa_key,
+        "-p", str(remote_port),
+        "-o", "StrictHostKeyChecking=no",
+        "-o", "UserKnownHostsFile=/dev/null",
+        f"{remote_user}@{remote_host}",
+        f"test -f {remote_path}"
+    ]
+    try:
+        result = subprocess.run(cmd, stdout=subprocess.PIPE, stderr=subprocess.PIPE, timeout=timeout)
+        if result.stdout: print("stdout", result.stdout.decode("utf-8"), end="")
+        if result.stderr: print("stderr", result.stderr.decode("utf-8"), file=sys.stderr, end="")
+        return result.returncode == 0
+    except subprocess.TimeoutExpired:
+        print(f"TimeoutExpired: SSH command to check {remote_path} on {remote_host} timed out after {timeout} seconds.")
+        return False
+
+
+def download_file(remote_user, remote_host, remote_port, remote_file, local_file, rsa_key, timeout=1200):
+    """Download a file from the remote machine using scp and save it with a specific name."""
+    cmd = [
+        "scp",
+        "-i", rsa_key,
+        "-P", str(remote_port),
+        "-o", "StrictHostKeyChecking=no",
+        "-o", "UserKnownHostsFile=/dev/null",
+        f"{remote_user}@{remote_host}:{remote_file}",
+        str(local_file)
+    ]
+    try:
+        result = subprocess.run(cmd, stdout=subprocess.PIPE, stderr=subprocess.PIPE, timeout=timeout)
+        if result.stdout: print("stdout", result.stdout.decode("utf-8"), end="")
+        if result.stderr: print("stderr", result.stderr.decode("utf-8"), file=sys.stderr, end="")
+        return result.returncode == 0
+    except subprocess.TimeoutExpired:
+        print(f"TimeoutExpired: SSH command to download {remote_file} on {remote_host} timed out after {timeout} seconds.")
+        return False
+
+
+def delete_remote_lock(remote_user, remote_host, remote_port, remote_lock_file, rsa_key, timeout=10):
+    """Delete the lock file on the remote machine."""
+    cmd = [
+        "ssh",
+        "-i", rsa_key,
+        "-p", str(remote_port),
+        "-o", "StrictHostKeyChecking=no",
+        "-o", "UserKnownHostsFile=/dev/null",
+        f"{remote_user}@{remote_host}",
+        f"rm -f {remote_lock_file}"
+    ]
+    try:
+        result = subprocess.run(cmd, stdout=subprocess.PIPE, stderr=subprocess.PIPE, timeout=timeout)
+        if result.stdout: print("stdout", result.stdout.decode("utf-8"), end="")
+        if result.stderr: print("stderr", result.stderr.decode("utf-8"), file=sys.stderr, end="")
+        return result.returncode == 0
+    except subprocess.TimeoutExpired:
+        print(f"TimeoutExpired: SSH command to delete {remote_lock_file} on {remote_host} timed out after {timeout} seconds.")
+        return False
+
+
+def get_unique_filename(local_dest, base_name):
+    """Generate a unique filename with a timestamp and return the full path."""
+    timestamp = datetime.datetime.now().strftime("%Y%m%d%H%M%S")
+    filename = f"{base_name}_{timestamp}.safetensors"
+    return os.path.join(local_dest, filename)
+
+
+if __name__ == "__main__":
+    main()
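
For reference, a hypothetical invocation of the poller; every argument value below is a placeholder, not something recorded in this commit.

    # Hypothetical example of running download_checkpoints.py against a remote pod.
    # All values are placeholders; substitute your own host, port, paths and key.
    import subprocess

    subprocess.run([
        "python", "download_checkpoints.py",
        "--remote-ip", "203.0.113.10",        # placeholder IP
        "--remote-port", "22",                # placeholder SSH port
        "--remote-user", "root",
        "--remote-base-path", "/workspace",
        "--local-dest", "./checkpoints",
        "--rsa-key", "/path/to/id_rsa",       # placeholder key path
        "--poll-interval", "10",
    ], check=True)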
joint_loss.py ADDED
@@ -0,0 +1,510 @@
+import csv
+import dataclasses
+import subprocess
+from copy import deepcopy
+import itertools
+from concurrent.futures import ThreadPoolExecutor
+import pathlib
+from typing import List
+import diffusers
+import transformers
+import safetensors.torch
+import torch.utils.data
+from tqdm import tqdm
+from datetime import datetime
+import random
+import os
+import time
+from torch.utils.tensorboard import SummaryWriter
+
+
+torch.manual_seed(0)
+random.seed(0)
+
+
+LATENTS_OUTPUT_DIR = pathlib.Path("latents")
+CAPTIONS_OUTPUT_DIR = pathlib.Path("captions2")
+DANBOORU_ARTISTS_PATH = pathlib.Path("danbooru_artist.csv")
+E621_ARTISTS_PATH = pathlib.Path("e621_artist.csv")
+LOCK_FILE = "safetensors.lock"
+
+
+device = torch.device("cuda")
+dtype = torch.float16
+
+
+train_logger = SummaryWriter(f"logs/pony_scoreless_{datetime.now().strftime('%Y%m%d_%H%M%S')}")
+
+
+def accumulate_grads():
+    batch_size = 1
+    epochs = 1
+
+    tokenizer = create_tokenizer(device)
+
+    model_a = diffusers.StableDiffusionXLPipeline.from_single_file(
+        "NoobAI-XL-v1.1.safetensors",
+        torch_dtype=dtype,
+    )
+    delattr(model_a, "vae")
+    model_a.unet.to(device=device)
+    # model_a.unet.enable_xformers_memory_efficient_attention()
+    model_a.unet.enable_gradient_checkpointing()
+    model_a.text_encoder.to(device=device)
+    model_a.text_encoder.gradient_checkpointing_enable()
+    model_a.text_encoder_2.to(device=device)
+    model_a.text_encoder_2.gradient_checkpointing_enable()
+    model_a.text_encoder_combined = CombinedCLIPTextEncoder(model_a.text_encoder, model_a.text_encoder_2, batch_size)
+
+    model_b = diffusers.StableDiffusionXLPipeline.from_single_file(
+        "animagine-xl-4.0.safetensors",
+        torch_dtype=dtype,
+    )
+    delattr(model_b, "vae")
+    model_b.unet.to(device=device)
+    # model_b.unet.enable_xformers_memory_efficient_attention()
+    model_b.unet.enable_gradient_checkpointing()
+    model_b.text_encoder.to(device=device)
+    model_b.text_encoder.gradient_checkpointing_enable()
+    model_b.text_encoder_2.to(device=device)
+    model_b.text_encoder_2.gradient_checkpointing_enable()
+    model_b.text_encoder_combined = CombinedCLIPTextEncoder(model_b.text_encoder, model_b.text_encoder_2, batch_size)
+
+    model_a.unet.eval()
+    model_a.text_encoder.eval()
+    model_a.text_encoder_2.eval()
+    model_b.unet.eval()
+    model_b.text_encoder.eval()
+    model_b.text_encoder_2.eval()
+
+    # shared_stats = {}
+    # stats_lock = threading.Lock()
+
+    # # Two barriers for synchronization between two threads.
+    # grad_barrier1 = threading.Barrier(2)
+    # grad_barrier2 = threading.Barrier(2)
+
+    # def scaling_hook_factory(key, branch_id, target_scale=1.0):
+    #     nonlocal shared_stats, stats_lock, grad_barrier1, grad_barrier2
+
+    #     def scaling_hook(_module, _grad_input, grad_output):
+    #         """
+    #         A full-backward hook that:
+    #         1. Computes, for each non-None tensor in grad_output, its maximum absolute value.
+    #            We store these in a dictionary (keyed by output index).
+    #         2. Waits once until both threads have stored their local max values.
+    #         3. Computes, for each output index, the global maximum from both models.
+    #         4. Waits a second time to ensure synchronization before clearing the shared stats.
+    #         5. Scales each non-None output tensor independently using its computed scaling factor.
+    #         Outputs that are None are passed through unchanged.
+    #         """
+    #         # Step 1: Compute and store local maximums per output index.
+    #         print(f"backprop for {key}")
+    #         local_maxes = {}
+    #         for i, g in enumerate(grad_output):
+    #             if g is not None:
+    #                 local_maxes[i] = g.detach().abs().max().cpu().item()
+
+    #         with stats_lock:
+    #             shared_stats[f"{key}_{branch_id}"] = local_maxes
+
+    #         # Step 2: Wait until both threads have stored their values.
+    #         grad_barrier1.wait()
+
+    #         # Step 3: Compute the global maximum for each output index.
+    #         with stats_lock:
+    #             stats_a = shared_stats.get(f"{key}_a", {})
+    #             stats_b = shared_stats.get(f"{key}_b", {})
+    #             # Build a dictionary for global max per output index.
+    #             global_maxes = {}
+    #             for i in local_maxes.keys():
+    #                 assert i in stats_a and i in stats_b, key
+    #                 global_maxes[i] = max(stats_a[i], stats_b[i])
+
+    #         # Step 4: Wait again to ensure both threads have computed the global values.
+    #         barrier_val = grad_barrier2.wait()
+    #         # Let only one thread clear the shared stats.
+    #         if barrier_val == 0:
+    #             with stats_lock:
+    #                 shared_stats.pop(f"{key}_a")
+    #                 shared_stats.pop(f"{key}_b")
+
+    #         # Step 5: For each output tensor, compute a scaling factor and apply it.
+    #         scaled_outputs = []
+    #         for i, g in enumerate(grad_output):
+    #             if g is not None:
+    #                 global_max = global_maxes[i]
+    #                 # Compute scaling factor only if global_max is positive and below target_scale.
+    #                 if 0 < global_max < target_scale:
+    #                     g = g * (target_scale / global_max)
+    #                 scaled_outputs.append(g)
+    #             else:
+    #                 scaled_outputs.append(None)
+
+    #         return tuple(scaled_outputs)
+
+    #     return scaling_hook
+
+    # for model, branch_id in zip((model_a, model_b), ("a", "b")):
+    #     for k, v in get_modules(model):
+    #         if k.endswith("transformer_blocks") or k.endswith("encoder.layers"):
+    #             for i, module in enumerate(v):
+    #                 module.register_full_backward_hook(scaling_hook_factory(f"{k}.{i}", branch_id))
+
+    scheduler = create_scheduler(device)
+    data_loader = get_data_loader(tokenizer, batch_size)
+    total_steps = 0
+
+    log_scalars_a = {}
+    log_scalars_b = {}
+    log_scalars_sync = {}
+
+    n1 = torch.tensor(-1, device=device, dtype=torch.long)
+    ldexp_offset = torch.tensor(20, device=device, dtype=torch.long)
+    def create_hook(param, k, log_scalars):
+        param.grad = torch.zeros_like(param)
+        log_scalars[k] = ldexp_offset.clone()
+
+        def hook(grad):
+            nonlocal param, log_scalars, k
+            while True:
+                new_grad = param.grad + grad.abs().ldexp(log_scalars[k])
+                if not new_grad.isfinite().all():  # overflow
+                    log_scalars[k] -= 1
+                    param.grad.ldexp_(n1)
+                else:
+                    break
+
+            param.grad.copy_(new_grad)
+            return param.grad
+
+        return hook
+
+    for model, log_scalars in ((model_a, log_scalars_a), (model_b, log_scalars_b)):
+        for k, v in get_params(model):
+            v.register_hook(create_hook(v, k, log_scalars))
+
+    # for model, path in ((model_a, "grads_a.safetensors"), (model_b, "grads_b.safetensors")):
+    #     with safetensors.safe_open(path, "pt") as f:
+    #         for k, v in get_params(model):
+    #             if k in f.keys():
+    #                 v.grad = f.get_tensor(k).to(v)
+
+    noisy_latents = timesteps = time_ids = None
+    def get_pred(args):
+        nonlocal noisy_latents, timesteps, time_ids
+        model, tokens = args
+        txt = model.text_encoder_combined(tokens[0])
+        return model.unet(
+            noisy_latents,
+            timesteps,
+            encoder_hidden_states=txt["conds"],
+            added_cond_kwargs={
+                "text_embeds": txt["pooled"],
+                "time_ids": time_ids,
+            },
+        ).sample
+
+    params = list(v for k, v in itertools.chain(get_params(model_a), get_params(model_b)))
+    with ThreadPoolExecutor(max_workers=2) as worker:
+        for epoch_i in range(epochs):
+            for step_i, (latent_infos, tokens_a, tokens_b, post_ids) in enumerate(tqdm(data_loader)):
+                latents = torch.cat([latent_info["latent"] for latent_info in latent_infos], dim=0).to(device=device, dtype=dtype)
+                crop_hw = torch.stack([latent_info["crop_hw"] for latent_info in latent_infos]).to(device=device)
+                orig_hw = torch.stack([latent_info["orig_hw"] for latent_info in latent_infos]).to(device=device)
+
+                noise, noisy_latents, timesteps = get_noise_noisy_latents_and_timesteps(scheduler, latents)
+                time_ids = get_add_time_ids(orig_hw, crop_hw)
+
+                # if step_i < 1000:
+                #     total_steps += batch_size
+                #     continue
+
+                pred_a, pred_b = worker.map(get_pred, ((model_a, tokens_a), (model_b, tokens_b)))
+
+                mse = torch.nn.functional.mse_loss(pred_a, pred_b, reduction="none").flatten(start_dim=1).mean(dim=-1)
+                loss = (mse / mse.detach()).mean()
+
+                train_logger.add_scalar("grads/loss", loss.item(), total_steps)
+                train_logger.add_scalar("grads/loss_raw", mse.mean().item(), total_steps)
+                train_logger.add_scalar("grads/timestep", timesteps[0].item(), total_steps)
+
+                torch.autograd.grad(loss, params, retain_graph=False, allow_unused=True)  # calls backward hooks
+
+                for (k, v_a), (k_b, v_b) in zip(get_params(model_a), get_params(model_b)):
+                    assert k == k_b
+                    if v_a.grad is not None and v_b.grad is not None:
+                        while log_scalars_a[k] > log_scalars_b[k]:
+                            log_scalars_a[k] -= 1
+                            v_a.grad.ldexp_(n1)
+                        while log_scalars_b[k] > log_scalars_a[k]:
+                            log_scalars_b[k] -= 1
+                            v_b.grad.ldexp_(n1)
+                        log_scalars_sync[k] = log_scalars_a[k]
+
+                if (step_i + 1) % 10 == 0:
+                    train_logger.add_scalar("grads/max_a", max(v.grad.max().item() for k, v in get_params(model_a) if v.grad is not None), total_steps)
+                    train_logger.add_scalar("grads/max_b", max(v.grad.max().item() for k, v in get_params(model_b) if v.grad is not None), total_steps)
+
+                if (step_i + 1) % 1000 == 0:
+                    save_grads(model_a, "grads_a.safetensors", first=True)
+                    safetensors.torch.save_file(log_scalars_sync, "log_scalars.safetensors")
+                    save_grads(model_b, "grads_b.safetensors", last=True)
+
+                total_steps += batch_size
+
+
+def get_modules(model):
+    return itertools.chain(
+        prefix_iter(model.unet.named_modules(), "unet."),
+        prefix_iter(model.text_encoder.named_modules(), "text_encoder."),
+        prefix_iter(model.text_encoder_2.named_modules(), "text_encoder_2."),
+    )
+
+
+def get_params(model):
+    return itertools.chain(
+        prefix_iter(model.unet.named_parameters(), "unet."),
+        prefix_iter(model.text_encoder.named_parameters(), "text_encoder."),
+        prefix_iter(model.text_encoder_2.named_parameters(), "text_encoder_2."),
+    )
+
+
+def prefix_iter(item_iter, prefix):
+    return ((prefix + k, v) for k, v in item_iter)
+
+
+def save_grads(model, path, first=False, last=False):
+    if first:
+        wait_for_lock_removal()
+
+    safetensors.torch.save_file(
+        {k: v.grad.cpu().contiguous() for k, v in get_params(model) if v.grad is not None},
+        path,
+    )
+
+    if last:
+        # Create a lock file to signal that new checkpoints have been saved
+        with open(LOCK_FILE, "w") as f:
+            f.write("pending download")
+        print("Checkpoint pair saved, lock file created.")
+
+
+def wait_for_lock_removal(poll_interval=5):
+    """Wait until the lock file is removed by the local download script."""
+    while os.path.exists(LOCK_FILE):
+        time.sleep(poll_interval)
+
+
+def create_scheduler(device: torch.device):
+    scheduler = diffusers.DDPMScheduler(
+        beta_start=0.00085,
+        beta_end=0.012,
+        beta_schedule="scaled_linear",
+        num_train_timesteps=1000,
+        clip_sample=False,
+    )
+
+    inv_snr = ((1-scheduler.alphas_cumprod) / scheduler.alphas_cumprod).to(device)
+    scheduler.inv_snr = inv_snr
+    scheduler.inv_snr_weights = inv_snr / inv_snr.sum()
+    return scheduler
+
+
+def debiased_loss_scaling(timesteps, noise_scheduler):
+    return noise_scheduler.inv_snr[timesteps]
+
+
+def get_noise_noisy_latents_and_timesteps(scheduler, latents):
+    batch_size = latents.shape[0]
+    noise = torch.randn_like(latents, device=latents.device)
+
+    timesteps = torch.multinomial(scheduler.inv_snr_weights, batch_size)
+    noisy_latents = scheduler.add_noise(latents, noise, timesteps)
+    return noise, noisy_latents, timesteps
+
+
+def get_add_time_ids(original_size, crops_coords_top_left):
+    add_time_ids = torch.cat([
+        original_size,
+        crops_coords_top_left,
+        torch.tensor([[1024]*2], device=original_size.device).expand(len(original_size), -1),
+    ], dim=1)
+
+    return add_time_ids
+
+
+def get_data_loader(tokenizer, batch_size: int):
+    return torch.utils.data.DataLoader(
+        PromptDataset(tokenizer),
+        batch_size=batch_size,
+        shuffle=True,
+        collate_fn=lambda x: zip(*x),
+    )
+
+
+@dataclasses.dataclass
+class ArtistScore:
+    artist_tag: str
+    count: int
+
+
+class PromptDataset(torch.utils.data.Dataset):
+    def __init__(self, tokenizer):
+        self.tokenizer = tokenizer
+        self.latent_paths = list(LATENTS_OUTPUT_DIR.iterdir())
+        with open(DANBOORU_ARTISTS_PATH, "r", encoding='utf-8') as f:
+            reader = csv.DictReader(f)
+            self.b_artists = [ArtistScore(r["trigger"], int(r["count"])) for r in reader if r["artist"] != "banned_artist"]
+        self.b_artists.sort(key=lambda t: t.count, reverse=True)
+        self.b_artist_scores = torch.tensor(list(map(lambda t: t.count, self.b_artists)), device=device, dtype=torch.float32)
+        self.b_artist_scores /= self.b_artist_scores.sum()
+
+        with open(E621_ARTISTS_PATH, "r", encoding='utf-8') as f:
+            reader = csv.DictReader(f,)
+            self.a_artists = self.b_artists + [ArtistScore(r["trigger"], int(r["count"])) for r in reader if r["artist"] not in ["conditional_dnp", "avoid_posting", "unknown_artist", "third-party_edit", "sound_warning", "anonymous_artist"]]
+        self.a_artists.sort(key=lambda t: t.count, reverse=True)
+        self.a_artist_scores = torch.tensor(list(map(lambda t: t.count, self.a_artists)), device=device, dtype=torch.float32)
+        self.a_artist_scores /= self.a_artist_scores.sum()
+
+        self.a_prefix = "masterpiece, best quality, newest, absurdres, highres, safe, "
+        self.b_suffix = ", masterpiece, high score, great score, absurdres"
+
+    def __len__(self):
+        return len(self.latent_paths)
+
+    def __getitem__(self, item):
+        post_id = self.latent_paths[item].stem
+        latent = safetensors.torch.load_file(LATENTS_OUTPUT_DIR / f"{post_id}.safetensors", device=str(device))
+        caption = (CAPTIONS_OUTPUT_DIR / f"{post_id}.txt").read_text()
+
+        caption_a = self.a_prefix + caption
+        caption_b = caption + self.b_suffix
+
+        if item % 2 == 0:
+            artist_a = self.a_artists[torch.multinomial(self.a_artist_scores, 1).item()]
+            caption_a = artist_a.artist_tag + ", " + caption_a
+        else:
+            artist_b = self.b_artists[torch.multinomial(self.b_artist_scores, 1).item()]
+            caption_b = artist_b.artist_tag + ", " + caption_b
+
+        tokens_a = self.tokenizer.chunk_tokens(self.tokenizer([caption_a.replace("),", ") ,")]))
+        tokens_b = self.tokenizer.chunk_tokens(self.tokenizer([caption_b.replace("),", ") ,")]))
+        return latent, tokens_a, tokens_b, post_id
+
+
+class CombinedCLIPTextEncoder(torch.nn.Module):
+    def __init__(self, clip_l, clip_g, batch_size):
+        super().__init__()
+        assert batch_size == 1
+        self.clip_l = clip_l
+        self.clip_g = clip_g
+
+    def forward(self, tokens):
+        tokens_clip_l = tokens["clip_l"].copy()
+        del tokens_clip_l["prompt_starts"]
+
+        tokens_clip_g = tokens["clip_g"].copy()
+        clip_g_starts = tokens_clip_g.pop("prompt_starts")
+
+        clip_l_encoded = self.clip_l(**tokens_clip_l, output_hidden_states=True, return_dict=True)
+        clip_g_encoded = self.clip_g(**tokens_clip_g, output_hidden_states=True, return_dict=True)
+        combined_encoded = torch.cat([clip_l_encoded["hidden_states"][-2], clip_g_encoded["hidden_states"][-2]], dim=-1)
+        combined_encoded_reshape = combined_encoded.reshape(1, -1, 2048)
+
+        return {
+            "conds": combined_encoded_reshape,
+            "pooled": clip_g_encoded.text_embeds[clip_g_starts],
+        }
+
+
+def create_tokenizer(device: torch.device):
+    tokenizer_l = transformers.CLIPTokenizer.from_pretrained("openai/clip-vit-large-patch14")
+    tokenizer_g = transformers.CLIPTokenizer.from_pretrained("laion/CLIP-ViT-g-14-laion2B-s34B-b88K")
+    return CombinedCLIPTokenizer(tokenizer_l, tokenizer_g, device)
+
+
+class CombinedCLIPTokenizer(torch.nn.Module):
+    comma_token = 267
+
+    def __init__(self, tokenizer_l, tokenizer_g, output_device: torch.device):
+        super().__init__()
+        self.tokenizer_l = tokenizer_l
+        self.tokenizer_g = tokenizer_g
+        self.output_device = output_device
+
+    def forward(self, prompts: List[str]) -> dict:
+        tokens_l = self.tokenizer_l(prompts, add_special_tokens=False)
+        return {
+            "clip_l": tokens_l,
+            "clip_g": deepcopy(tokens_l),
+        }
+
+    def chunk_tokens(self, tokens: dict):
+        return {
+            "clip_l": self._chunk_tokens_impl(self.tokenizer_l, tokens["clip_l"]),
+            "clip_g": self._chunk_tokens_impl(self.tokenizer_g, tokens["clip_g"]),
+        }
+
+    def _chunk_tokens_impl(self, tokenizer, tokens: dict):
+        input_ids = []
+        attention_masks = []
+        chunk_counts = []
+
+        for prompt, mask in zip(tokens["input_ids"], tokens["attention_mask"]):
+            last_comma = 0
+            current_chunk = []
+            chunks = []
+            chunks_attn = []
+
+            def next_chunk():
+                nonlocal current_chunk
+                current_chunk = [tokenizer.bos_token_id] + current_chunk + [tokenizer.eos_token_id]
+                num_tokens = len(current_chunk)
+
+                current_chunk.extend([tokenizer.pad_token_id] * (77 - num_tokens))
+                chunks.append(current_chunk)
+                current_chunk = []
+                chunks_attn.append([1] * num_tokens + [0] * (77 - num_tokens))
+
+            for token_i, token in enumerate(prompt):
+                is_last_token = token_i == len(prompt) - 1
+                seq_suffix = prompt[last_comma:token_i + int(is_last_token)]
+
+                if token == self.comma_token or is_last_token:
+                    if len(current_chunk) + len(seq_suffix) > 77 - 2:  # leave space for bos and eos
+                        next_chunk()
+                        seq_suffix = prompt[last_comma+1:token_i + int(is_last_token)]  # remove leading comma
+
+                    # can always append, sequences without commas will never be longer than 77 tokens
+                    current_chunk.extend(seq_suffix)
+                    last_comma = token_i
+
+            if current_chunk or not chunks:
+                next_chunk()
+
+            chunk_counts.append(len(chunks))
+            input_ids.extend(chunks)
+            attention_masks.extend(chunks_attn)
+
+        return {
+            "input_ids": torch.tensor(input_ids, device=self.output_device),
+            "attention_mask": torch.tensor(attention_masks, device=self.output_device),
+            "prompt_starts": torch.tensor([0] + chunk_counts[:-1], device=self.output_device).cumsum(dim=0),
+        }
+
+
+def shutdown_machine():
+    """Shutdown the machine. Adjust the command as necessary for your environment."""
+
+    wait_for_lock_removal()
+    print("All checkpoints have been downloaded. Shutting down the machine.")
+    try:
+        subprocess.run("runpodctl stop pod $RUNPOD_POD_ID", shell=True, check=True)
+    except Exception as e:
+        print(f"Error shutting down: {e}")
+
+
+if __name__ == "__main__":
+    accumulate_grads()
+    shutdown_machine()
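
The accumulators that joint_loss.py saves are sums of per-step |grad| values kept in fp16 and scaled by 2**log_scalar: the hook halves the running sum and lowers the exponent whenever an overflow is detected, and the per-key exponents are synced between the two models before saving. A minimal sketch, not part of the commit, of undoing that scaling after download, assuming the downloaded files sit in the working directory and share keys:

    import safetensors.torch

    grads_a = safetensors.torch.load_file("grads_a.safetensors")
    log_scalars = safetensors.torch.load_file("log_scalars.safetensors")

    # Each saved tensor is (sum of |grad|) * 2**log_scalar, so shift the exponent back out.
    unscaled_a = {k: v.float().ldexp(-log_scalars[k]) for k, v in grads_a.items()}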
prepare_dataset.py ADDED
@@ -0,0 +1,320 @@
+import pathlib
+import random
+from copy import deepcopy
+from typing import List
+import diffusers
+import torch
+import safetensors.torch
+import transformers
+from PIL import Image
+from diffusers import AutoencoderKL, StableDiffusionXLPipeline
+import torchvision.transforms as T
+from tqdm import tqdm
+from concurrent.futures import ThreadPoolExecutor, as_completed, wait, FIRST_COMPLETED
+import threading
+import dataclasses
+
+
+devices = [torch.device("cuda:0"), torch.device("cuda:1"), torch.device("cuda:2")]
+dtypes = [torch.bfloat16, torch.float32, torch.float32]
+
+
+VAE_PATH = "KBlueLeaf/EQ-SDXL-VAE"
+SDXL_PATH = "/home/ljleb/sd/models/Stable-diffusion/noobaiXLNAIXL_epsilonPred11Version.safetensors"
+IMAGES_DIR = pathlib.Path("/mnt/data/shared/danbooru")
+LATENT_DIR = pathlib.Path("/mnt/data/shared/danbooru-latent")
+
+
+@dataclasses.dataclass
+class Worker:
+    device: torch.device
+    dtype: torch.dtype
+    vae_w = None
+    sdxl = None
+    tokenizer = None
+
+    def __post_init__(self):
+        self.vae_w = AutoencoderKL.from_pretrained(VAE_PATH, torch_dtype=self.dtype).to(self.device)
+        self.vae_w.eval()
+
+        self.sdxl = StableDiffusionXLPipeline.from_single_file(SDXL_PATH, torch_dtype=self.dtype).to(self.device)
+        self.sdxl.unet.eval()
+        self.sdxl.vae.eval()
+        self.sdxl.text_encoder.eval()
+        self.sdxl.text_encoder_2.eval()
+
+        self.sdxl.text_encoder_combined = CombinedCLIPTextEncoder(self.sdxl.text_encoder, self.sdxl.text_encoder_2, self.device)
+        self.tokenizer = create_tokenizer(self.device)
+
+        self.scheduler = create_scheduler()
+
+
+def main():
+    images = list(IMAGES_DIR.iterdir())
+    LATENT_DIR.mkdir(exist_ok=True)
+    workers = [
+        Worker(device, dtype)
+        for device, dtype in zip(devices, dtypes)
+    ]
+    with ThreadPoolExecutor(max_workers=len(workers)) as executor:
+        futures = {}
+        for image in tqdm(images):
+            if len(futures) >= len(workers):
+                completed_futures, _ = wait(list(futures.values()), return_when=FIRST_COMPLETED)
+                for future in completed_futures:
+                    if future.exception() is not None:
+                        for future_to_cancel in futures.values():
+                            future_to_cancel.cancel()
+                        raise future.exception()
+                    else:
+                        future.result()
+                futures = {
+                    k: v for k, v in futures.items()
+                    if v not in completed_futures
+                }
+
+            for worker in workers:
+                if worker.device not in futures:
+                    futures[worker.device] = executor.submit(prepare_image, worker, image)
+                    break
+
+        for future in futures.values():
+            if future.exception() is not None:
+                for future_to_cancel in futures.values():
+                    future_to_cancel.cancel()
+                raise future.exception()
+            else:
+                future.result()
+
+
+@torch.no_grad()
+def prepare_image(worker: Worker, img_path: pathlib.Path):
+    # We'll define a transform to convert an image to a tensor
+    to_tensor = T.Compose([
+        T.ToTensor(),
+        T.Lambda(lambda t: t*2 - 1)
+    ])
+
+    # w_0_offset = torch.tensor([-3.8846, -1.3187, 0.8009, 0.9180], device=device, dtype=dtype)
+    # w_0_scale = torch.tensor([10.0298, 6.8674, 7.2104, 5.5948], device=device, dtype=dtype)
+
+    # Iterate over images in directory
+    if not img_path.is_file():
+        return
+    if img_path.suffix.lower() not in [".jpg", ".jpeg", ".png", ".webp", ".bmp", ".tiff"]:
+        return
+
+    # Attempt to open image
+    try:
+        img = Image.open(img_path).convert("RGB")
+    except Exception as e:
+        print(f"Error loading image {img_path.name}: {e}")
+        return
+
+    # Read the caption from the matching .txt file (if it exists)
+    txt_path = img_path.with_suffix(img_path.suffix + ".txt")
+    if not txt_path.is_file():
+        print(f"No caption file for {img_path.name}, skipping.")
+        return
+
+    caption = txt_path.read_text(encoding="utf-8").strip()
+    if not caption:
+        print(f"Empty caption for {img_path.name}, skipping.")
+        return
+
+    out_path = LATENT_DIR / (img_path.stem + ".safetensors")
+    if out_path.exists():
+        return
+
+    caption = caption.replace("\n", " , ").replace("_", " ")
+
+    width, height = img.size
+    orig_pixels = width * height
+    target_pixels = 1024 * 1024
+    if orig_pixels > target_pixels:
+        scale = (target_pixels / float(orig_pixels)) ** 0.5
+        width = int(round(width * scale))
+        height = int(round(height * scale))
+        img = img.resize((width, height), Image.Resampling.LANCZOS)
+
+    tokens_raw = worker.tokenizer([caption])
+    tokens = worker.tokenizer.chunk_tokens(tokens_raw)
+
+    # Convert image to tensor on device
+    img_tensor = to_tensor(img).unsqueeze(0).to(device=worker.device, dtype=worker.dtype)
+
+    # Encode the image with each VAE
+    with torch.no_grad():
+        latents_w_unnorm = worker.vae_w.encode(img_tensor).latent_dist.sample()
+        latents_z = worker.sdxl.vae.encode(img_tensor).latent_dist.sample() * 0.13025
+
+    # Sample noise and a random timestep
+    noise, noisy_latents_z, timesteps = get_noise_noisy_latents_and_timesteps(worker.scheduler, latents_z)
+    time_ids = get_add_time_ids(height, width, worker.device)
+    embeds = worker.sdxl.text_encoder_combined(tokens)
+
+    epsilon_pred = get_pred(worker.sdxl, noisy_latents_z, embeds, timesteps, time_ids)
+
+    encoded = {
+        "timesteps": timesteps,
+        "hw": torch.tensor([[height, width]], dtype=torch.long),
+        "w_0_unnorm": latents_w_unnorm,
+        "z_0": latents_z,
+        "epsilon_pred": epsilon_pred,
+        "epsilon": noise,
+        "conds": embeds["conds"],
+        "pooled": embeds["pooled"],
+    }
+
+    safetensors.torch.save_file(encoded, str(out_path))
+
+
+def get_add_time_ids(width, height, device):
+    original_size = torch.tensor([[width, height]], device=device)
+    add_time_ids = torch.cat([
+        original_size,
+        torch.tensor([[0]*2], device=device).expand(len(original_size), -1),
+        original_size,
+    ], dim=1)
+    return add_time_ids
+
+
+def get_pred(sdxl, noisy_latents, embeds, timesteps, time_ids):
+    return sdxl.unet(
+        noisy_latents,
+        timesteps,
+        encoder_hidden_states=embeds["conds"],
+        added_cond_kwargs={
+            "text_embeds": embeds["pooled"],
+            "time_ids": time_ids,
+        },
+    ).sample
+
+
+def get_noise_noisy_latents_and_timesteps(scheduler, latents):
+    noise = torch.randn_like(latents, device=latents.device)
+    batch_size = latents.shape[0]
+    timesteps = torch.randint(0, 999, (batch_size,), device=latents.device)
+    noisy_latents = scheduler.add_noise(latents, noise, timesteps)
+    return noise, noisy_latents, timesteps
+
+
+def create_scheduler():
+    scheduler = diffusers.DDPMScheduler(
+        beta_start=0.00085,
+        beta_end=0.012,
+        beta_schedule="scaled_linear",
+        num_train_timesteps=1000,
+        clip_sample=False,
+    )
+
+    return scheduler
+
+
+class CombinedCLIPTextEncoder(torch.nn.Module):
+    def __init__(self, clip_l, clip_g, device):
+        super().__init__()
+        self.clip_l = clip_l.to(device=device)
+        self.clip_g = clip_g.to(device=device)
+        self.device = device
+
+    def forward(self, tokens_batch):
+        res = {
+            "conds": torch.tensor([], device=self.device).view(0, 1, 1),
+            "pooled": torch.tensor([], device=self.device).view(0, 1, 1),
+        }
+        tokens_clip_l = tokens_batch["clip_l"].copy()
+        del tokens_clip_l["prompt_starts"]
+
+        tokens_clip_g = tokens_batch["clip_g"].copy()
+        clip_g_starts = tokens_clip_g.pop("prompt_starts")
+
+        clip_l_encoded = self.clip_l(**tokens_clip_l, output_hidden_states=True, return_dict=True)
+        clip_g_encoded = self.clip_g(**tokens_clip_g, output_hidden_states=True, return_dict=True)
+        combined_encoded = torch.cat([clip_l_encoded["hidden_states"][-2], clip_g_encoded["hidden_states"][-2]], dim=-1)
+        combined_encoded_reshape = combined_encoded.reshape(1, -1, 2048)
+
+        res["conds"] = combined_encoded_reshape
+        res["pooled"] = clip_g_encoded.text_embeds[clip_g_starts]
+
+        return res
+
+
+def create_tokenizer(device: torch.device):
+    tokenizer_l = transformers.CLIPTokenizer.from_pretrained("openai/clip-vit-large-patch14")
+    tokenizer_g = transformers.CLIPTokenizer.from_pretrained("laion/CLIP-ViT-g-14-laion2B-s34B-b88K")
+    return CombinedCLIPTokenizer(tokenizer_l, tokenizer_g, device)
+
+
+class CombinedCLIPTokenizer(torch.nn.Module):
+    comma_token = 267
+
+    def __init__(self, tokenizer_l, tokenizer_g, output_device: torch.device):
+        super().__init__()
+        self.tokenizer_l = tokenizer_l
+        self.tokenizer_g = tokenizer_g
+        self.output_device = output_device
+
+    def forward(self, prompts: List[str]) -> dict:
+        tokens_l = self.tokenizer_l(prompts, add_special_tokens=False)
+        return {
+            "clip_l": tokens_l,
+            "clip_g": deepcopy(tokens_l),
+        }
+
+    def chunk_tokens(self, tokens: dict):
+        return {
+            "clip_l": self._chunk_tokens_impl(self.tokenizer_l, tokens["clip_l"]),
+            "clip_g": self._chunk_tokens_impl(self.tokenizer_g, tokens["clip_g"]),
+        }
+
+    def _chunk_tokens_impl(self, tokenizer, tokens: dict):
+        input_ids = []
+        attention_masks = []
+        chunk_counts = []
+
+        for prompt, mask in zip(tokens["input_ids"], tokens["attention_mask"]):
+            last_comma = 0
+            current_chunk = []
+            chunks = []
+            chunks_attn = []
+
+            def next_chunk():
+                nonlocal current_chunk
+                current_chunk = [tokenizer.bos_token_id] + current_chunk + [tokenizer.eos_token_id]
+                num_tokens = len(current_chunk)
+
+                current_chunk.extend([tokenizer.pad_token_id] * (77 - num_tokens))
+                chunks.append(current_chunk)
+                current_chunk = []
+                chunks_attn.append([1] * num_tokens + [0] * (77 - num_tokens))
+
+            for token_i, token in enumerate(prompt):
+                is_last_token = token_i == len(prompt) - 1
+                seq_suffix = prompt[last_comma:token_i + int(is_last_token)]
+
+                if token == self.comma_token or is_last_token:
+                    if len(current_chunk) + len(seq_suffix) > 77 - 2:  # leave space for bos and eos
+                        next_chunk()
+                        seq_suffix = prompt[last_comma+1:token_i + int(is_last_token)]  # remove leading comma
+
+                    # can always append, sequences without commas will never be longer than 77 tokens
+                    current_chunk.extend(seq_suffix)
+                    last_comma = token_i
+
+            if current_chunk or not chunks:
+                next_chunk()
+
+            chunk_counts.append(len(chunks))
+            input_ids.extend(chunks)
+            attention_masks.extend(chunks_attn)
+
+        return {
+            "input_ids": torch.tensor(input_ids, device=self.output_device),
+            "attention_mask": torch.tensor(attention_masks, device=self.output_device),
+            "prompt_starts": torch.tensor([0] + chunk_counts[:-1], device=self.output_device).cumsum(dim=0),
+        }
+
+
+if __name__ == "__main__":
+    main()
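
Each output file written by prepare_dataset.py bundles the sampled timestep, image size, both VAE latents, the noise, the epsilon prediction from the reference SDXL model, and the text-encoder outputs under fixed keys. A minimal sketch, not part of the commit, for inspecting one of those files (the path is a placeholder):

    import safetensors.torch

    sample = safetensors.torch.load_file("12345.safetensors")  # placeholder post id
    for key in ("timesteps", "hw", "w_0_unnorm", "z_0", "epsilon_pred", "epsilon", "conds", "pooled"):
        print(key, tuple(sample[key].shape), sample[key].dtype)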