Jae-Won Chung committed
Commit c97bae1 · Parent(s): abd945c

Updated diffusion benchmark and data

This view is limited to 50 files because it contains too many changes.
- .gitignore +1 -1
- benchmark/diffusion/image-to-video/pegasus/A100/queue_1gpu.yaml +4 -4
- benchmark/diffusion/image-to-video/pegasus/H100/queue_1gpu.yaml +4 -4
- benchmark/diffusion/image-to-video/scripts/aggregate_leaderboard_models.py +1 -1
- benchmark/diffusion/image-to-video/scripts/benchmark_one_datapoint.py +26 -15
- benchmark/diffusion/image-to-video/scripts/benchmark_one_model.py +41 -35
- benchmark/diffusion/image-to-video/sharegpt4video/extract_first_frame.py +1 -1
- benchmark/diffusion/text-to-image/pegasus/A100/queue_1gpu.yaml +1 -1
- benchmark/diffusion/text-to-image/scripts/aggregate_leaderboard_models.py +2 -1
- benchmark/diffusion/text-to-image/scripts/benchmark_one_datapoint.py +49 -8
- benchmark/diffusion/text-to-image/scripts/benchmark_one_model.py +20 -18
- benchmark/diffusion/text-to-video/pegasus/A100/queue_1gpu.yaml +1 -1
- benchmark/diffusion/text-to-video/pegasus/H100/queue_1gpu.yaml +1 -1
- benchmark/diffusion/text-to-video/scripts/aggregate_leaderboard_models.py +2 -1
- benchmark/diffusion/text-to-video/scripts/benchmark_one_datapoint.py +11 -11
- benchmark/diffusion/text-to-video/scripts/benchmark_one_model.py +37 -35
- data/diffusion/image-to-video/A100-SXM4-40GB/ali-vilab/i2vgen-xl/bs1+steps25+frames16.json +2 -2
- data/diffusion/image-to-video/A100-SXM4-40GB/ali-vilab/i2vgen-xl/bs2+steps25+frames16.json +2 -2
- data/diffusion/image-to-video/A100-SXM4-40GB/stabilityai/stable-video-diffusion-img2vid-xt/bs1+steps25+frames25.json +2 -2
- data/diffusion/image-to-video/A100-SXM4-40GB/stabilityai/stable-video-diffusion-img2vid-xt/bs2+steps25+frames25.json +9 -0
- data/diffusion/image-to-video/A100-SXM4-40GB/stabilityai/stable-video-diffusion-img2vid-xt/bs3+steps25+frames25.json +9 -0
- data/diffusion/image-to-video/A100-SXM4-40GB/stabilityai/stable-video-diffusion-img2vid/bs1+steps25+frames14.json +2 -2
- data/diffusion/image-to-video/A100-SXM4-40GB/stabilityai/stable-video-diffusion-img2vid/bs2+steps25+frames14.json +9 -0
- data/diffusion/image-to-video/A100-SXM4-40GB/stabilityai/stable-video-diffusion-img2vid/bs3+steps25+frames14.json +9 -0
- data/diffusion/image-to-video/H100 80GB HBM3/ali-vilab/i2vgen-xl/bs1+steps25+frames16.json +2 -2
- data/diffusion/image-to-video/H100 80GB HBM3/ali-vilab/i2vgen-xl/bs2+steps25+frames16.json +2 -2
- data/diffusion/image-to-video/H100 80GB HBM3/stabilityai/stable-video-diffusion-img2vid-xt/bs1+steps25+frames25.json +2 -2
- data/diffusion/image-to-video/H100 80GB HBM3/stabilityai/stable-video-diffusion-img2vid-xt/bs2+steps25+frames25.json +9 -0
- data/diffusion/image-to-video/H100 80GB HBM3/stabilityai/stable-video-diffusion-img2vid-xt/bs3+steps25+frames25.json +9 -0
- data/diffusion/image-to-video/H100 80GB HBM3/stabilityai/stable-video-diffusion-img2vid/bs1+steps25+frames14.json +2 -2
- data/diffusion/image-to-video/H100 80GB HBM3/stabilityai/stable-video-diffusion-img2vid/bs2+steps25+frames14.json +9 -0
- data/diffusion/image-to-video/H100 80GB HBM3/stabilityai/stable-video-diffusion-img2vid/bs3+steps25+frames14.json +9 -0
- data/diffusion/text-to-image/A100-SXM4-40GB/kandinsky-community/kandinsky-2-2-decoder/bs1+steps25.json +2 -2
- data/diffusion/text-to-image/A100-SXM4-40GB/kandinsky-community/kandinsky-2-2-decoder/bs16+steps25.json +2 -2
- data/diffusion/text-to-image/A100-SXM4-40GB/kandinsky-community/kandinsky-2-2-decoder/bs2+steps25.json +2 -2
- data/diffusion/text-to-image/A100-SXM4-40GB/kandinsky-community/kandinsky-2-2-decoder/bs32+steps25.json +8 -0
- data/diffusion/text-to-image/A100-SXM4-40GB/kandinsky-community/kandinsky-2-2-decoder/bs4+steps25.json +2 -2
- data/diffusion/text-to-image/A100-SXM4-40GB/kandinsky-community/kandinsky-2-2-decoder/bs8+steps25.json +2 -2
- data/diffusion/text-to-image/A100-SXM4-40GB/kandinsky-community/kandinsky-3/bs1+steps25.json +2 -2
- data/diffusion/text-to-image/A100-SXM4-40GB/kandinsky-community/kandinsky-3/bs2+steps25.json +2 -2
- data/diffusion/text-to-image/A100-SXM4-40GB/prompthero/openjourney-v4/bs1+steps25.json +2 -2
- data/diffusion/text-to-image/A100-SXM4-40GB/prompthero/openjourney-v4/bs16+steps25.json +2 -2
- data/diffusion/text-to-image/A100-SXM4-40GB/prompthero/openjourney-v4/bs2+steps25.json +2 -2
- data/diffusion/text-to-image/A100-SXM4-40GB/prompthero/openjourney-v4/bs32+steps25.json +8 -0
- data/diffusion/text-to-image/A100-SXM4-40GB/prompthero/openjourney-v4/bs4+steps25.json +2 -2
- data/diffusion/text-to-image/A100-SXM4-40GB/prompthero/openjourney-v4/bs64+steps25.json +8 -0
- data/diffusion/text-to-image/A100-SXM4-40GB/prompthero/openjourney-v4/bs8+steps25.json +2 -2
- data/diffusion/text-to-image/A100-SXM4-40GB/segmind/SSD-1B/bs1+steps25.json +2 -2
- data/diffusion/text-to-image/A100-SXM4-40GB/segmind/SSD-1B/bs2+steps25.json +2 -2
- data/diffusion/text-to-image/A100-SXM4-40GB/segmind/SSD-1B/bs4+steps25.json +2 -2
    	
.gitignore  CHANGED

@@ -18,4 +18,4 @@ build/
 
 # Data files
 *.log
-
+figures/
    	
benchmark/diffusion/image-to-video/pegasus/A100/queue_1gpu.yaml  CHANGED

@@ -1,6 +1,6 @@
 - command:
-    - "python scripts/benchmark_one_model.py {{ model }} --result-root results/joule --dataset-path sharegpt4video/sharegpt4video_100.json --gpu-ids {{ gpu }} --batch-sizes …
+    - "python scripts/benchmark_one_model.py {{ model }} --result-root results/joule --dataset-path sharegpt4video/sharegpt4video_100.json --gpu-ids {{ gpu }} --batch-sizes 4 3 2 1 --power-limits 400 --num-inference-steps 1 2 4 8 16 25 30 40 50"
   model:
-    - '--model ali-vilab/i2vgen-xl --num-frames 16 --add-text-prompt'
-    - '--model stabilityai/stable-video-diffusion-img2vid --num-frames 14'
-    - '--model stabilityai/stable-video-diffusion-img2vid-xt --num-frames 25'
+    - '--model ali-vilab/i2vgen-xl --num-frames 16 --add-text-prompt --width 1280 --height 720'
+    - '--model stabilityai/stable-video-diffusion-img2vid --num-frames 14 --width 1024 --height 576'
+    - '--model stabilityai/stable-video-diffusion-img2vid-xt --num-frames 25 --width 1024 --height 576'
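For a sense of scale, the updated queue sweeps four batch sizes and nine inference-step settings for each of the three image-to-video models. A quick back-of-the-envelope sketch (plain Python, not part of the repository) of how many benchmark_one_datapoint runs this queue implies per GPU type:

    from itertools import product

    # Values taken from the queue file above.
    models = [
        "ali-vilab/i2vgen-xl",
        "stabilityai/stable-video-diffusion-img2vid",
        "stabilityai/stable-video-diffusion-img2vid-xt",
    ]
    batch_sizes = [4, 3, 2, 1]
    num_inference_steps = [1, 2, 4, 8, 16, 25, 30, 40, 50]

    # One (model, batch size, steps) combination per benchmark_one_datapoint invocation.
    print(len(list(product(models, batch_sizes, num_inference_steps))))  # 3 * 4 * 9 = 108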
    	
benchmark/diffusion/image-to-video/pegasus/H100/queue_1gpu.yaml  CHANGED

@@ -1,6 +1,6 @@
 - command:
-    - "python scripts/benchmark_one_model.py {{ model }} --result-root results/joule --dataset-path sharegpt4video/…
+    - "python scripts/benchmark_one_model.py {{ model }} --result-root results/joule --dataset-path sharegpt4video/sharegpt4video_100.json --gpu-ids {{ gpu }} --batch-sizes 4 3 2 1 --power-limits 700 --num-inference-steps 1 2 4 8 16 25 30 40 50"
   model:
-    - … (removed line not visible in this view)
-    - … (removed line not visible in this view)
-    - … (removed line not visible in this view)
+    - "--model ali-vilab/i2vgen-xl --num-frames 16 --add-text-prompt --width 1280 --height 720"
+    - "--model stabilityai/stable-video-diffusion-img2vid --num-frames 14 --width 1024 --height 576"
+    - "--model stabilityai/stable-video-diffusion-img2vid-xt --num-frames 25 --width 1024 --height 576"
    	
benchmark/diffusion/image-to-video/scripts/aggregate_leaderboard_models.py  CHANGED

@@ -15,7 +15,7 @@ def main(results_dir: Path, output_file: Path) -> None:
     for model_dir in sorted(glob(f"{results_dir}/*/*")):
         model_name = "/".join(model_dir.split("/")[-2:])
         print(f"  {model_name}")
-        result_file_cand = glob(f"{model_dir}/bs1+*+results.json")
+        result_file_cand = glob(f"{model_dir}/bs1+*+steps25+results.json")
         assert len(result_file_cand) == 1, model_name
         results_data = json.load(open(result_file_cand[0]))
         denosing_module_name = "unet" if "unet" in results_data["num_parameters"] else "transformer"
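The narrower glob matches the new result-file naming introduced in benchmark_one_datapoint.py, which now embeds the inference-step count, so the aggregator still picks exactly one file per model (the batch-size-1, 25-step run). A minimal sketch of how the names line up, assuming results are written as f"{benchmark_name}+results.json" (only the +args.json sibling is shown explicitly in the diffs below):

    from fnmatch import fnmatch

    # Hypothetical values for illustration.
    batch_size, power_limit, num_inference_steps = 1, 400, 25

    # Mirrors benchmark_name in benchmark_one_datapoint.py.
    benchmark_name = f"bs{batch_size}+pl{power_limit}+steps{num_inference_steps}"
    result_file = f"{benchmark_name}+results.json"  # assumed suffix

    print(result_file)                                         # bs1+pl400+steps25+results.json
    print(fnmatch(result_file, "bs1+*+steps25+results.json"))  # True: matched by the new glob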
    	
benchmark/diffusion/image-to-video/scripts/benchmark_one_datapoint.py  CHANGED

@@ -27,10 +27,10 @@ class Results:
     model: str
     num_parameters: dict[str, int]
     gpu_model: str
-    num_infernece_steps: int
-    num_frames: int
     power_limit: int
     batch_size: int
+    num_inference_steps: int
+    num_frames: int
     num_prompts: int
     total_runtime: float = 0.0
     total_energy: float = 0.0
@@ -80,6 +80,7 @@ def load_text_image_prompts(
     path: str,
     batch_size: int,
     num_batches: int | None = None,
+    image_resize: tuple[int, int] | None = None,
 ) -> tuple[int, list[tuple[list[str], list[Image.Image]]]]:
     """Load the dataset to feed the model and return it as a list of batches of prompts.
 
@@ -93,6 +94,9 @@ def load_text_image_prompts(
     dataset = json.load(open(path))
     assert len(dataset["caption"]) == len(dataset["video_id"])
 
+    dataset["caption"] *= 10
+    dataset["video_id"] *= 10
+
     if num_batches is not None:
         if len(dataset["caption"]) < num_batches * batch_size:
             raise ValueError("Not enough data for the requested number of batches.")
@@ -103,6 +107,8 @@ def load_text_image_prompts(
     dataset["first_frame"] = [
         load_image(str(image_path / f"{video_id}.jpg")) for video_id in dataset["video_id"]
     ]
+    if image_resize is not None:
+        dataset["first_frame"] = [image.resize(image_resize) for image in dataset["first_frame"]]
 
     batched = [
         (dataset["caption"][i : i + batch_size], dataset["first_frame"][i : i + batch_size])
@@ -135,8 +141,8 @@ def benchmark(args: argparse.Namespace) -> None:
 
     results_dir = Path(args.result_root) / args.model
     results_dir.mkdir(parents=True, exist_ok=True)
-    benchmark_name = str(results_dir / f"bs{args.batch_size}+pl{args.power_limit}")
-    video_dir = results_dir / f"bs{args.batch_size}+pl{args.power_limit}+generated"
+    benchmark_name = str(results_dir / f"bs{args.batch_size}+pl{args.power_limit}+steps{args.num_inference_steps}")
+    video_dir = results_dir / f"bs{args.batch_size}+pl{args.power_limit}+steps{args.num_inference_steps}+generated"
     video_dir.mkdir(exist_ok=True)
 
     arg_out_filename = f"{benchmark_name}+args.json"
@@ -150,11 +156,16 @@ def benchmark(args: argparse.Namespace) -> None:
     pynvml.nvmlInit()
     handle = pynvml.nvmlDeviceGetHandleByIndex(0)
     gpu_model = pynvml.nvmlDeviceGetName(handle)
-    pynvml.nvmlDeviceSetPersistenceMode(handle, pynvml.NVML_FEATURE_ENABLED)
-    pynvml.nvmlDeviceSetPowerManagementLimit(handle, args.power_limit * 1000)
+    # pynvml.nvmlDeviceSetPersistenceMode(handle, pynvml.NVML_FEATURE_ENABLED)
+    # pynvml.nvmlDeviceSetPowerManagementLimit(handle, args.power_limit * 1000)
     pynvml.nvmlShutdown()
 
-    num_prompts, batched_prompts = load_text_image_prompts(…
+    num_prompts, batched_prompts = load_text_image_prompts(
+        args.dataset_path,
+        args.batch_size,
+        args.num_batches,
+        (args.width, args.height),
+    )
 
     pipeline = get_pipeline(args.model)
 
@@ -189,7 +200,7 @@ def benchmark(args: argparse.Namespace) -> None:
     fps_param_name = fps_param_name_candidates[0]
 
     torch.cuda.reset_peak_memory_stats(device="cuda:0")
-    zeus_monitor.begin_window("benchmark", …
+    zeus_monitor.begin_window("benchmark", sync_execution=False)
 
     # Build common parameter dict for all batches
     params: dict[str, Any] = dict(
@@ -210,15 +221,15 @@ def benchmark(args: argparse.Namespace) -> None:
         if args.add_text_prompt:
             params["prompt"] = intermediate.prompts
 
-        zeus_monitor.begin_window("batch", …
+        zeus_monitor.begin_window("batch", sync_execution=False)
         frames = pipeline(**params).frames
-        batch_measurements = zeus_monitor.end_window("batch", …
+        batch_measurements = zeus_monitor.end_window("batch", sync_execution=False)
 
         intermediate.frames = frames
         intermediate.batch_latency = batch_measurements.time
         intermediate.batch_energy = batch_measurements.total_energy
 
-    measurements = zeus_monitor.end_window("benchmark", …
+    measurements = zeus_monitor.end_window("benchmark", sync_execution=False)
     peak_memory = torch.cuda.max_memory_allocated(device="cuda:0")
 
     results: list[Result] = []
@@ -255,10 +266,10 @@ def benchmark(args: argparse.Namespace) -> None:
         model=args.model,
         num_parameters=count_parameters(pipeline),
         gpu_model=gpu_model,
-        num_infernece_steps=args.num_inference_steps,
-        num_frames=args.num_frames,
         power_limit=args.power_limit,
         batch_size=args.batch_size,
+        num_inference_steps=args.num_inference_steps,
+        num_frames=args.num_frames,
         num_prompts=num_prompts,
         total_runtime=measurements.time,
         total_energy=measurements.total_energy,
@@ -289,8 +300,8 @@ if __name__ == "__main__":
     parser.add_argument("--num-inference-steps", type=int, default=50, help="The number of denoising steps.")
    parser.add_argument("--num-frames", type=int, default=1, help="The number of frames to generate.")
     parser.add_argument("--fps", type=int, default=16, help="Frames per second for micro-conditioning.")
-    parser.add_argument("--height", type=int, help="Height of the generated video.")
-    parser.add_argument("--width", type=int, help="Width of the generated video.")
+    parser.add_argument("--height", type=int, required=True, help="Height of the generated video.")
+    parser.add_argument("--width", type=int, required=True, help="Width of the generated video.")
     parser.add_argument("--num-batches", type=int, default=None, help="The number of batches to use from the dataset.")
     parser.add_argument("--save-every", type=int, default=10, help="Save generations to file every N prompts.")
     parser.add_argument("--seed", type=int, default=0, help="The seed to use for the RNG.")
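The measurement windows above follow Zeus's begin/end pattern, now passing sync_execution=False in place of the previous (truncated) keyword. A minimal standalone sketch of that pattern, assuming the zeus-ml ZeusMonitor API these scripts already use:

    from zeus.monitor import ZeusMonitor

    monitor = ZeusMonitor(gpu_indices=[0])

    # Outer window covers the whole benchmark; inner windows cover each batch.
    monitor.begin_window("benchmark", sync_execution=False)
    for batch_index in range(8):
        monitor.begin_window("batch", sync_execution=False)
        # ... one pipeline(**params) call would go here ...
        batch_meas = monitor.end_window("batch", sync_execution=False)
        print(batch_index, batch_meas.time, batch_meas.total_energy)
    measurement = monitor.end_window("benchmark", sync_execution=False)
    print(measurement.time, measurement.total_energy)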
    	
benchmark/diffusion/image-to-video/scripts/benchmark_one_model.py  CHANGED

@@ -28,44 +28,48 @@ def main(args: argparse.Namespace) -> None:
     print_and_write(outfile, f"Benchmarking {args.model}\n")
     print_and_write(outfile, f"Batch sizes: {args.batch_sizes}\n")
     print_and_write(outfile, f"Power limits: {args.power_limits}\n")
+    print_and_write(outfile, f"Number of inference steps: {args.num_inference_steps}\n")
 
     for batch_size in args.batch_sizes:
         for power_limit in args.power_limits:
-            … (removed lines not visible in this view)
+            for num_inference_steps in args.num_inference_steps:
+                print_and_write(outfile, f"{batch_size=}, {power_limit=}, {num_inference_steps=}\n", flush=True)
+                with subprocess.Popen(
+                    args=[
+                        "docker", "run",
+                        "--gpus", '"device=' + ','.join(args.gpu_ids) + '"',
+                        "--cap-add", "SYS_ADMIN",
+                        "--name", f"leaderboard-i2v-{''.join(args.gpu_ids)}",
+                        "--rm",
+                        "-v", "/data/leaderboard/hfcache:/root/.cache/huggingface",
+                        "-v", f"{os.getcwd()}:/workspace/image-to-video",
+                        "mlenergy/leaderboard:diffusion-i2v",
+                        "--dataset-path", args.dataset_path,
+                        "--result-root", args.result_root,
+                        "--batch-size", batch_size,
+                        "--num-batches", "8",
+                        "--power-limit", power_limit,
+                        "--model", args.model,
+                        "--huggingface-token", hf_token,
+                        "--num-frames", args.num_frames,
+                        "--num-inference-steps", num_inference_steps,
+                        "--width", str(args.width),
+                        "--height", str(args.height),
+                    ] + (["--add-text-prompt"] if args.add_text_prompt else []),
+                    stdout=subprocess.PIPE,
+                    stderr=subprocess.STDOUT,
+                    text=True,
+                ) as proc:
+                    if proc.stdout:
+                        i = 0
+                        for line in proc.stdout:
+                            print_and_write(outfile, line, flush=i % 50 == 0)
+                            i += 1
 
+                # If proc exited with non-zero status, it's probably an OOM.
+                # Move on to the next batch size.
+                if proc.returncode != 0:
+                    break
 
 
@@ -77,8 +81,10 @@ if __name__ == "__main__":
     parser.add_argument("--batch-sizes", type=str, nargs="+", default=["8", "4", "2", "1"], help="Batch sizes to benchmark")
     parser.add_argument("--power-limits", type=str, nargs="+", default=["400", "300", "200"], help="Power limits to benchmark")
     parser.add_argument("--num-frames", type=str, help="Number of frames to generate")
-    parser.add_argument("--num-inference-steps", type=str, help="Number of …
+    parser.add_argument("--num-inference-steps", type=str, nargs="+", default=["1", "2", "4", "8", "16", "30", "40", "50"], help="Number of inference steps to run")
     parser.add_argument("--add-text-prompt", action="store_true", help="Input text prompt alongside image.")
+    parser.add_argument("--height", type=int, required=True, help="Height of the generated video.")
+    parser.add_argument("--width", type=int, required=True, help="Width of the generated video.")
     parser.add_argument("--dataset-path", type=str, help="Path to the dataset JSON file.")
     args = parser.parse_args()
     main(args)
    	
benchmark/diffusion/image-to-video/sharegpt4video/extract_first_frame.py  CHANGED

@@ -3,7 +3,7 @@ import json
 
 import cv2
 
-DATASET_PATH = "…
+DATASET_PATH = "sharegpt4video_100.json"
 
 
 def main() -> None:
    	
benchmark/diffusion/text-to-image/pegasus/A100/queue_1gpu.yaml  CHANGED

@@ -1,5 +1,5 @@
 - command:
-    - "python scripts/benchmark_one_model.py --model {{ model }} --result-root results/joule --gpu-ids {{ gpu }} --batch-sizes 16 8 4 2 1 --power-limits 400"
+    - "python scripts/benchmark_one_model.py --model {{ model }} --result-root results/joule --gpu-ids {{ gpu }} --batch-sizes 64 32 16 8 4 2 1 --num-inference-steps 1 2 4 8 16 25 30 40 50 --power-limits 400"
   model:
     - stabilityai/stable-diffusion-2-1
     - stabilityai/stable-diffusion-xl-base-1.0
    	
benchmark/diffusion/text-to-image/scripts/aggregate_leaderboard_models.py  CHANGED

@@ -15,7 +15,7 @@ def main(results_dir: Path, output_file: Path) -> None:
     for model_dir in sorted(glob(f"{results_dir}/*/*")):
         model_name = "/".join(model_dir.split("/")[-2:])
         print(f"  {model_name}")
-        result_file_cand = glob(f"{model_dir}/bs1+*+results.json")
+        result_file_cand = glob(f"{model_dir}/bs1+*+steps25+results.json")
         assert len(result_file_cand) == 1, model_name
         results_data = json.load(open(result_file_cand[0]))
         denosing_module_name = "unet" if "unet" in results_data["num_parameters"] else "transformer"
@@ -24,6 +24,7 @@ def main(results_dir: Path, output_file: Path) -> None:
             nickname=model_name.split("/")[-1].replace("-", " ").title(),
             total_params=raw_params_to_readable(sum(results_data["num_parameters"].values())),
             denoising_params=raw_params_to_readable(results_data["num_parameters"][denosing_module_name]),
+            resolution="NA",
         )
         assert model_name not in models
         models[model_name] = model_info
    	
benchmark/diffusion/text-to-image/scripts/benchmark_one_datapoint.py  CHANGED

@@ -1,8 +1,10 @@
 from __future__ import annotations
 
 import os
+import time
 import json
 import argparse
+import multiprocessing as mp
 from pprint import pprint
 from pathlib import Path
 from contextlib import suppress
@@ -11,6 +13,7 @@ from dataclasses import dataclass, field, asdict
 import torch
 import pynvml
 import numpy as np
+import pandas as pd
 from PIL import Image
 from datasets import load_dataset, Dataset
 from transformers.trainer_utils import set_seed
@@ -35,9 +38,9 @@ class Results:
     model: str
     num_parameters: dict[str, int]
     gpu_model: str
-    num_inference_steps: int
     power_limit: int
     batch_size: int
+    num_inference_steps: int
     num_prompts: int
     average_clip_score: float = 0.0
     total_runtime: float = 0.0
@@ -118,6 +121,28 @@ def load_partiprompts(
     return len(batched) * batch_size, batched
 
 
+def power_monitor(csv_path: str, gpu_indices: list[int], chan: mp.SimpleQueue) -> None:
+    pynvml.nvmlInit()
+    handles = [pynvml.nvmlDeviceGetHandleByIndex(i) for i in gpu_indices]
+
+    fields = [
+        (pynvml.NVML_FI_DEV_POWER_AVERAGE, pynvml.NVML_POWER_SCOPE_GPU),
+        (pynvml.NVML_FI_DEV_POWER_AVERAGE, pynvml.NVML_POWER_SCOPE_MEMORY),
+    ]
+
+    columns = ["timestamp"] + sum([[f"gpu{i}", f"vram{i}"] for i in gpu_indices], [])
+    power: list[list] = []
+    while chan.empty():
+        row = [time.monotonic()]
+        values = [pynvml.nvmlDeviceGetFieldValues(h, fields) for h in handles]
+        for value in values:
+            row.extend((value[0].value.uiVal, value[1].value.uiVal))
+        power.append(row)
+        time.sleep(max(0.0, 0.1 - (time.monotonic() - row[0])))
+
+    pd.DataFrame(power, columns=columns).to_csv(csv_path, index=False)
+
+
 def calculate_clip_score(
     model: CLIPModel,
     processor: CLIPProcessor,
@@ -183,8 +208,8 @@ def benchmark(args: argparse.Namespace) -> None:
 
     results_dir = Path(args.result_root) / args.model
     results_dir.mkdir(parents=True, exist_ok=True)
-    benchmark_name = str(results_dir / f"bs{args.batch_size}+pl{args.power_limit}")
-    image_dir = results_dir / f"bs{args.batch_size}+pl{args.power_limit}+generated"
+    benchmark_name = str(results_dir / f"bs{args.batch_size}+pl{args.power_limit}+steps{args.num_inference_steps}")
+    image_dir = results_dir / f"bs{args.batch_size}+pl{args.power_limit}+steps{args.num_inference_steps}+generated"
     image_dir.mkdir(exist_ok=True)
 
     arg_out_filename = f"{benchmark_name}+args.json"
@@ -222,27 +247,42 @@ def benchmark(args: argparse.Namespace) -> None:
         ResultIntermediateBatched(prompts=batch) for batch in batched_prompts
     ]
 
+    pmon = None
+    pmon_chan = None
+    if args.monitor_power:
+        pmon_chan = mp.SimpleQueue()
+        pmon = mp.get_context("spawn").Process(
+            target=power_monitor,
+            args=(f"{benchmark_name}+power.csv", [g.gpu_index for g in zeus_monitor.gpus.gpus], pmon_chan),
+        )
+        pmon.start()
+
     torch.cuda.reset_peak_memory_stats(device="cuda:0")
-    zeus_monitor.begin_window("benchmark", …
+    zeus_monitor.begin_window("benchmark", sync_execution=False)
 
     for ind, intermediate in enumerate(intermediates):
         print(f"Batch {ind + 1}/{len(intermediates)}")
-        zeus_monitor.begin_window("batch", …
+        zeus_monitor.begin_window("batch", sync_execution=False)
         images = pipeline(
             intermediate.prompts,
             generator=rng,
             num_inference_steps=args.num_inference_steps,
             output_type="np",
         ).images
-        batch_measurements = zeus_monitor.end_window("batch", …
+        batch_measurements = zeus_monitor.end_window("batch", sync_execution=False)
 
         intermediate.images = images
         intermediate.batch_latency = batch_measurements.time
         intermediate.batch_energy = batch_measurements.total_energy
 
-    measurements = zeus_monitor.end_window("benchmark", …
+    measurements = zeus_monitor.end_window("benchmark", sync_execution=False)
     peak_memory = torch.cuda.max_memory_allocated(device="cuda:0")
 
+    if pmon is not None and pmon_chan is not None:
+        pmon_chan.put("stop")
+        pmon.join(timeout=5.0)
+        pmon.terminate()
+
     # Scale images to [0, 256] and convert to uint8
     for intermediate in intermediates:
         intermediate.images = (intermediate.images * 255).astype("uint8")
@@ -292,9 +332,9 @@ def benchmark(args: argparse.Namespace) -> None:
         model=args.model,
         num_parameters=count_parameters(pipeline),
         gpu_model=gpu_model,
-        num_inference_steps=args.num_inference_steps,
         power_limit=args.power_limit,
         batch_size=args.batch_size,
+        num_inference_steps=args.num_inference_steps,
         num_prompts=num_prompts,
         average_clip_score=sum(r.clip_score for r in results) / len(results),
         total_runtime=measurements.time,
@@ -326,6 +366,7 @@ if __name__ == "__main__":
     parser.add_argument("--image-save-every", type=int, default=10, help="Save images to file every N prompts.")
     parser.add_argument("--seed", type=int, default=0, help="The seed to use for the RNG.")
     parser.add_argument("--huggingface-token", type=str, help="The HuggingFace token to use.")
+    parser.add_argument("--monitor-power", default=False, action="store_true", help="Whether to monitor power over time.")
     args = parser.parse_args()
 
     benchmark(args)
    	
benchmark/diffusion/text-to-image/scripts/benchmark_one_model.py
CHANGED

@@ -28,12 +28,13 @@ def main(args: argparse.Namespace) -> None:
     print_and_write(outfile, f"Benchmarking {args.model}\n")
     print_and_write(outfile, f"Batch sizes: {args.batch_sizes}\n")
     print_and_write(outfile, f"Power limits: {args.power_limits}\n")
+    print_and_write(outfile, f"Number of inference steps: {args.num_inference_steps}\n")

     for batch_size in args.batch_sizes:
         for power_limit in args.power_limits:
-
-
-
+            for num_inference_steps in args.num_inference_steps:
+                print_and_write(outfile, f"{batch_size=}, {power_limit=}, {num_inference_steps=}\n", flush=True)
+                cmd=[
                     "docker", "run",
                     "--gpus", '"device=' + ','.join(args.gpu_ids) + '"',
                     "--cap-add", "SYS_ADMIN",

@@ -48,22 +49,21 @@ def main(args: argparse.Namespace) -> None:
                     "--power-limit", power_limit,
                     "--model", args.model,
                     "--huggingface-token", hf_token,
-                    "--num-inference-steps",
-                ]
-
-
-                text=True
-
-
-
-
-
-                        i += 1
+                    "--num-inference-steps", num_inference_steps,
+                ]
+                if args.monitor_power:
+                    cmd.append("--monitor-power")
+                with subprocess.Popen(args=cmd, stdout=subprocess.PIPE, stderr=subprocess.STDOUT, text=True) as proc:
+                    if proc.stdout:
+                        i = 0
+                        for line in proc.stdout:
+                            print_and_write(outfile, line, flush=i % 50 == 0)
+                            i += 1

-
-
-
-
+                # If proc exited with non-zero status, it's probably an OOM.
+                # Move on to the next batch size.
+                if proc.returncode != 0:
+                    break


@@ -74,5 +74,7 @@ if __name__ == "__main__":
     parser.add_argument("--gpu-ids", type=str, nargs="+", help="GPU IDs to use")
     parser.add_argument("--batch-sizes", type=str, nargs="+", default=["8", "4", "2", "1"], help="Batch sizes to benchmark")
     parser.add_argument("--power-limits", type=str, nargs="+", default=["400", "300", "200"], help="Power limits to benchmark")
+    parser.add_argument("--num-inference-steps", type=str, nargs="+", default=["1", "2", "4", "8", "16", "25", "30", "40", "50"], help="Number of inference steps to run")
+    parser.add_argument("--monitor-power", default=False, action="store_true", help="Whether to monitor power over time.")
     args = parser.parse_args()
     main(args)
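With the defaults added above, this driver now sweeps batch size, power limit, and inference-step count, launching one benchmark container per configuration. A minimal sketch of the resulting sweep size, using only the argparse defaults visible in this diff (the actual loop order is batch size, then power limit, then steps):

    # Sketch: enumerate the configurations the updated text-to-image driver runs by default.
    from itertools import product

    batch_sizes = ["8", "4", "2", "1"]
    power_limits = ["400", "300", "200"]
    num_inference_steps = ["1", "2", "4", "8", "16", "25", "30", "40", "50"]

    configs = list(product(batch_sizes, power_limits, num_inference_steps))
    print(len(configs))  # 4 * 3 * 9 = 108 container launches per model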
    	
benchmark/diffusion/text-to-video/pegasus/A100/queue_1gpu.yaml
CHANGED

@@ -1,5 +1,5 @@
 - command:
-  - "python scripts/benchmark_one_model.py --model {{ model }} --result-root results/joule --dataset-path sharegpt4video/sharegpt4video_100.json --gpu-ids {{ gpu }} --batch-sizes 16 8 4 2 1 --power-limits 400 --num-inference-steps 25 --num-frames 16"
+  - "python scripts/benchmark_one_model.py --model {{ model }} --result-root results/joule --dataset-path sharegpt4video/sharegpt4video_100.json --gpu-ids {{ gpu }} --batch-sizes 32 16 8 4 2 1 --power-limits 400 --num-inference-steps 1 2 4 8 16 25 30 40 50 --num-frames 16"
   model:
     - ali-vilab/text-to-video-ms-1.7b
     - guoyww/animatediff-motion-adapter-v1-5-3
    	
benchmark/diffusion/text-to-video/pegasus/H100/queue_1gpu.yaml
CHANGED

@@ -1,5 +1,5 @@
 - command:
-  - "python scripts/benchmark_one_model.py --model {{ model }} --result-root results/joule --dataset-path sharegpt4video/
+  - "python scripts/benchmark_one_model.py --model {{ model }} --result-root results/joule --dataset-path sharegpt4video/sharegpt4video_100.json --gpu-ids {{ gpu }} --batch-sizes 32 16 8 4 2 1 --power-limits 700 --num-inference-steps 1 2 4 8 16 25 30 40 50 --num-frames 16"
   model:
     - ali-vilab/text-to-video-ms-1.7b
     - guoyww/animatediff-motion-adapter-v1-5-3
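These queue files drive the Pegasus runner. Assuming the usual placeholder substitution (an assumption about the runner, not something shown in this diff), each queue entry expands into one command per model per GPU. A rough sketch of that expansion:

    # Sketch: expand a queue entry like the A100 one above into concrete commands.
    # The "{{ model }}"/"{{ gpu }}" substitution is assumed runner behavior; GPU IDs are illustrative.
    command = (
        "python scripts/benchmark_one_model.py --model {{ model }} --result-root results/joule "
        "--dataset-path sharegpt4video/sharegpt4video_100.json --gpu-ids {{ gpu }} "
        "--batch-sizes 32 16 8 4 2 1 --power-limits 400 "
        "--num-inference-steps 1 2 4 8 16 25 30 40 50 --num-frames 16"
    )
    models = ["ali-vilab/text-to-video-ms-1.7b", "guoyww/animatediff-motion-adapter-v1-5-3"]
    gpus = ["0"]  # illustrative

    for model in models:
        for gpu in gpus:
            print(command.replace("{{ model }}", model).replace("{{ gpu }}", gpu))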
    	
benchmark/diffusion/text-to-video/scripts/aggregate_leaderboard_models.py
CHANGED

@@ -15,7 +15,7 @@ def main(results_dir: Path, output_file: Path) -> None:
     for model_dir in sorted(glob(f"{results_dir}/*/*")):
         model_name = "/".join(model_dir.split("/")[-2:])
         print(f"  {model_name}")
-        result_file_cand = glob(f"{model_dir}/bs1+*+results.json")
+        result_file_cand = glob(f"{model_dir}/bs1+*+steps25+results.json")
         assert len(result_file_cand) == 1, model_name
         results_data = json.load(open(result_file_cand[0]))
         denosing_module_name = "unet" if "unet" in results_data["num_parameters"] else "transformer"

@@ -24,6 +24,7 @@ def main(results_dir: Path, output_file: Path) -> None:
             nickname=model_name.split("/")[-1].replace("-", " ").title(),
             total_params=raw_params_to_readable(sum(results_data["num_parameters"].values())),
             denoising_params=raw_params_to_readable(results_data["num_parameters"][denosing_module_name]),
+            resolution="NA",
         )
         assert model_name not in models
         models[model_name] = model_info
    	
benchmark/diffusion/text-to-video/scripts/benchmark_one_datapoint.py
CHANGED

@@ -32,10 +32,10 @@ class Results:
     model: str
     num_parameters: dict[str, int]
     gpu_model: str
-    num_inference_steps: int
-    num_frames: int
     power_limit: int
     batch_size: int
+    num_inference_steps: int
+    num_frames: int
     num_prompts: int
     total_runtime: float = 0.0
     total_energy: float = 0.0

@@ -119,7 +119,7 @@ def load_text_prompts(
     Returns:
         Total number of prompts and a list of batches of prompts.
     """
-    dataset = json.load(open(path))["caption"]
+    dataset = json.load(open(path))["caption"] * 10
     if num_batches is not None:
         if len(dataset) < num_batches * batch_size:
             raise ValueError("Dataset is too small for the given number of batches.")

@@ -151,8 +151,8 @@ def benchmark(args: argparse.Namespace) -> None:

     results_dir = Path(args.result_root) / args.model
     results_dir.mkdir(parents=True, exist_ok=True)
-    benchmark_name = str(results_dir / f"bs{args.batch_size}+pl{args.power_limit}")
-    video_dir = results_dir / f"bs{args.batch_size}+pl{args.power_limit}+generated"
+    benchmark_name = str(results_dir / f"bs{args.batch_size}+pl{args.power_limit}+steps{args.num_inference_steps}")
+    video_dir = results_dir / f"bs{args.batch_size}+pl{args.power_limit}+steps{args.num_inference_steps}+generated"
     video_dir.mkdir(exist_ok=True)

     arg_out_filename = f"{benchmark_name}+args.json"

@@ -190,7 +190,7 @@ def benchmark(args: argparse.Namespace) -> None:
     ]

     torch.cuda.reset_peak_memory_stats(device="cuda:0")
-    zeus_monitor.begin_window("benchmark",
+    zeus_monitor.begin_window("benchmark", sync_execution=False)

     # Build common parameter dict for all batches
     params: dict[str, Any] = dict(

@@ -208,15 +208,15 @@ def benchmark(args: argparse.Namespace) -> None:

         params["prompt"] = intermediate.prompts

-        zeus_monitor.begin_window("batch",
+        zeus_monitor.begin_window("batch", sync_execution=False)
         frames = pipeline(**params).frames
-        batch_measurements = zeus_monitor.end_window("batch",
+        batch_measurements = zeus_monitor.end_window("batch", sync_execution=False)

         intermediate.frames = frames
         intermediate.batch_latency = batch_measurements.time
         intermediate.batch_energy = batch_measurements.total_energy

-    measurements = zeus_monitor.end_window("benchmark",
+    measurements = zeus_monitor.end_window("benchmark", sync_execution=False)
     peak_memory = torch.cuda.max_memory_allocated(device="cuda:0")

     results: list[Result] = []

@@ -253,10 +253,10 @@ def benchmark(args: argparse.Namespace) -> None:
         model=args.model,
         num_parameters=count_parameters(pipeline),
         gpu_model=gpu_model,
-        num_inference_steps=args.num_inference_steps,
-        num_frames=args.num_frames,
         power_limit=args.power_limit,
         batch_size=args.batch_size,
+        num_inference_steps=args.num_inference_steps,
+        num_frames=args.num_frames,
         num_prompts=num_prompts,
         total_runtime=measurements.time,
         total_energy=measurements.total_energy,
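The renamed outputs (`bs{batch}+pl{power}+steps{steps}+...`) are exactly what the updated `aggregate_leaderboard_models.py` above now selects with its `bs1+*+steps25+results.json` glob. A small check of that naming convention (the file names below are illustrative, including the power limit):

    # Sketch: the new result-file names match the updated aggregation glob; old-style names do not.
    from fnmatch import fnmatch

    new_style = "bs1+pl400+steps25+results.json"  # written after this commit (illustrative)
    old_style = "bs1+pl400+results.json"          # written before this commit

    pattern = "bs1+*+steps25+results.json"        # glob used by aggregate_leaderboard_models.py
    print(fnmatch(new_style, pattern))  # True
    print(fnmatch(old_style, pattern))  # False: old result files are no longer picked up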
    	
benchmark/diffusion/text-to-video/scripts/benchmark_one_model.py
CHANGED

@@ -28,44 +28,46 @@ def main(args: argparse.Namespace) -> None:
     print_and_write(outfile, f"Benchmarking {args.model}\n")
     print_and_write(outfile, f"Batch sizes: {args.batch_sizes}\n")
     print_and_write(outfile, f"Power limits: {args.power_limits}\n")
+    print_and_write(outfile, f"Number of inference steps: {args.num_inference_steps}\n")

     for batch_size in args.batch_sizes:
         for power_limit in args.power_limits:
-            (old lines 34-68 removed; their content is not captured in this view)
+            for num_inference_steps in args.num_inference_steps:
+                print_and_write(outfile, f"{batch_size=}, {power_limit=}, {num_inference_steps=}\n", flush=True)
+                with subprocess.Popen(
+                    args=[
+                        "docker", "run",
+                        "--gpus", '"device=' + ','.join(args.gpu_ids) + '"',
+                        "--cap-add", "SYS_ADMIN",
+                        "--name", f"leaderboard-t2v-{''.join(args.gpu_ids)}",
+                        "--rm",
+                        "-v", "/data/leaderboard/hfcache:/root/.cache/huggingface",
+                        "-v", f"{os.getcwd()}:/workspace/text-to-video",
+                        "mlenergy/leaderboard:diffusion-t2v",
+                        "--result-root", args.result_root,
+                        "--batch-size", batch_size,
+                        "--num-batches", "10",
+                        "--power-limit", power_limit,
+                        "--model", args.model,
+                        "--dataset-path", args.dataset_path,
+                        "--huggingface-token", hf_token,
+                        "--num-inference-steps", num_inference_steps,
+                        "--num-frames", args.num_frames,
+                    ],
+                    stdout=subprocess.PIPE,
+                    stderr=subprocess.STDOUT,
+                    text=True,
+                ) as proc:
+                    if proc.stdout:
+                        i = 0
+                        for line in proc.stdout:
+                            print_and_write(outfile, line, flush=i % 50 == 0)
+                            i += 1

+                # If proc exited with non-zero status, it's probably an OOM.
+                # Move on to the next batch size.
+                if proc.returncode != 0:
+                    break


@@ -76,7 +78,7 @@ if __name__ == "__main__":
     parser.add_argument("--gpu-ids", type=str, nargs="+", help="GPU IDs to use")
     parser.add_argument("--batch-sizes", type=str, nargs="+", default=["8", "4", "2", "1"], help="Batch sizes to benchmark")
     parser.add_argument("--power-limits", type=str, nargs="+", default=["400", "300", "200"], help="Power limits to benchmark")
-    parser.add_argument("--num-inference-steps", type=str,
+    parser.add_argument("--num-inference-steps", type=str, nargs="+", default=["1", "2", "4", "8", "16", "25", "30", "40", "50"], help="Number of denoising steps")
     parser.add_argument("--num-frames", type=str, required=True, help="Number of frames to generate")
     parser.add_argument("--dataset-path", type=str, help="Path to the dataset JSON file.")
     args = parser.parse_args()
    	
data/diffusion/image-to-video/A100-SXM4-40GB/ali-vilab/i2vgen-xl/bs1+steps25+frames16.json
CHANGED

@@ -1,8 +1,8 @@
 {
   "Model": "ali-vilab/i2vgen-xl",
   "GPU": "NVIDIA A100-SXM4-40GB",
-  "Energy/video (J)":
-  "Batch latency (s)":
+  "Energy/video (J)": 16915.850124999997,
+  "Batch latency (s)": 46.14208295941353,
   "Batch size": 1,
   "Denoising steps": 25,
   "Frames": 16

data/diffusion/image-to-video/A100-SXM4-40GB/ali-vilab/i2vgen-xl/bs2+steps25+frames16.json
CHANGED

@@ -1,8 +1,8 @@
 {
   "Model": "ali-vilab/i2vgen-xl",
   "GPU": "NVIDIA A100-SXM4-40GB",
-  "Energy/video (J)":
-  "Batch latency (s)":
+  "Energy/video (J)": 16496.045437499997,
+  "Batch latency (s)": 89.03019031882286,
   "Batch size": 2,
   "Denoising steps": 25,
   "Frames": 16

data/diffusion/image-to-video/A100-SXM4-40GB/stabilityai/stable-video-diffusion-img2vid-xt/bs1+steps25+frames25.json
CHANGED

@@ -1,8 +1,8 @@
 {
   "Model": "stabilityai/stable-video-diffusion-img2vid-xt",
   "GPU": "NVIDIA A100-SXM4-40GB",
-  "Energy/video (J)":
-  "Batch latency (s)": 42.
+  "Energy/video (J)": 15709.767625000095,
+  "Batch latency (s)": 42.397395104169846,
   "Batch size": 1,
   "Denoising steps": 25,
   "Frames": 25

data/diffusion/image-to-video/A100-SXM4-40GB/stabilityai/stable-video-diffusion-img2vid-xt/bs2+steps25+frames25.json
ADDED

@@ -0,0 +1,9 @@
+{
+  "Model": "stabilityai/stable-video-diffusion-img2vid-xt",
+  "GPU": "NVIDIA A100-SXM4-40GB",
+  "Energy/video (J)": 15291.016625000047,
+  "Batch latency (s)": 82.90474811196327,
+  "Batch size": 2,
+  "Denoising steps": 25,
+  "Frames": 25
+}

data/diffusion/image-to-video/A100-SXM4-40GB/stabilityai/stable-video-diffusion-img2vid-xt/bs3+steps25+frames25.json
ADDED

@@ -0,0 +1,9 @@
+{
+  "Model": "stabilityai/stable-video-diffusion-img2vid-xt",
+  "GPU": "NVIDIA A100-SXM4-40GB",
+  "Energy/video (J)": 14761.389999999976,
+  "Batch latency (s)": 120.65004900523594,
+  "Batch size": 3,
+  "Denoising steps": 25,
+  "Frames": 25
+}

data/diffusion/image-to-video/A100-SXM4-40GB/stabilityai/stable-video-diffusion-img2vid/bs1+steps25+frames14.json
CHANGED

@@ -1,8 +1,8 @@
 {
   "Model": "stabilityai/stable-video-diffusion-img2vid",
   "GPU": "NVIDIA A100-SXM4-40GB",
-  "Energy/video (J)":
-  "Batch latency (s)": 24.
+  "Energy/video (J)": 9066.434124999912,
+  "Batch latency (s)": 24.369865357875824,
   "Batch size": 1,
   "Denoising steps": 25,
   "Frames": 14

data/diffusion/image-to-video/A100-SXM4-40GB/stabilityai/stable-video-diffusion-img2vid/bs2+steps25+frames14.json
ADDED

@@ -0,0 +1,9 @@
+{
+  "Model": "stabilityai/stable-video-diffusion-img2vid",
+  "GPU": "NVIDIA A100-SXM4-40GB",
+  "Energy/video (J)": 8835.22312499996,
+  "Batch latency (s)": 47.65615049004555,
+  "Batch size": 2,
+  "Denoising steps": 25,
+  "Frames": 14
+}

data/diffusion/image-to-video/A100-SXM4-40GB/stabilityai/stable-video-diffusion-img2vid/bs3+steps25+frames14.json
ADDED

@@ -0,0 +1,9 @@
+{
+  "Model": "stabilityai/stable-video-diffusion-img2vid",
+  "GPU": "NVIDIA A100-SXM4-40GB",
+  "Energy/video (J)": 8683.536285714292,
+  "Batch latency (s)": 70.55723374230521,
+  "Batch size": 3,
+  "Denoising steps": 25,
+  "Frames": 14
+}

data/diffusion/image-to-video/H100 80GB HBM3/ali-vilab/i2vgen-xl/bs1+steps25+frames16.json
CHANGED

@@ -1,8 +1,8 @@
 {
   "Model": "ali-vilab/i2vgen-xl",
   "GPU": "NVIDIA H100 80GB HBM3",
-  "Energy/video (J)":
-  "Batch latency (s)":
+  "Energy/video (J)": 14867.419125000015,
+  "Batch latency (s)": 23.717748790979385,
   "Batch size": 1,
   "Denoising steps": 25,
   "Frames": 16

data/diffusion/image-to-video/H100 80GB HBM3/ali-vilab/i2vgen-xl/bs2+steps25+frames16.json
CHANGED

@@ -1,8 +1,8 @@
 {
   "Model": "ali-vilab/i2vgen-xl",
   "GPU": "NVIDIA H100 80GB HBM3",
-  "Energy/video (J)":
-  "Batch latency (s)":
+  "Energy/video (J)": 14348.508499999996,
+  "Batch latency (s)": 44.71498331427574,
   "Batch size": 2,
   "Denoising steps": 25,
   "Frames": 16

data/diffusion/image-to-video/H100 80GB HBM3/stabilityai/stable-video-diffusion-img2vid-xt/bs1+steps25+frames25.json
CHANGED

@@ -1,8 +1,8 @@
 {
   "Model": "stabilityai/stable-video-diffusion-img2vid-xt",
   "GPU": "NVIDIA H100 80GB HBM3",
-  "Energy/video (J)":
-  "Batch latency (s)": 20.
+  "Energy/video (J)": 13392.813624999952,
+  "Batch latency (s)": 20.788252592086792,
   "Batch size": 1,
   "Denoising steps": 25,
   "Frames": 25

data/diffusion/image-to-video/H100 80GB HBM3/stabilityai/stable-video-diffusion-img2vid-xt/bs2+steps25+frames25.json
ADDED

@@ -0,0 +1,9 @@
+{
+  "Model": "stabilityai/stable-video-diffusion-img2vid-xt",
+  "GPU": "NVIDIA H100 80GB HBM3",
+  "Energy/video (J)": 12901.83275000006,
+  "Batch latency (s)": 39.99498334527016,
+  "Batch size": 2,
+  "Denoising steps": 25,
+  "Frames": 25
+}

data/diffusion/image-to-video/H100 80GB HBM3/stabilityai/stable-video-diffusion-img2vid-xt/bs3+steps25+frames25.json
ADDED

@@ -0,0 +1,9 @@
+{
+  "Model": "stabilityai/stable-video-diffusion-img2vid-xt",
+  "GPU": "NVIDIA H100 80GB HBM3",
+  "Energy/video (J)": 12790.552809523862,
+  "Batch latency (s)": 59.380911929266794,
+  "Batch size": 3,
+  "Denoising steps": 25,
+  "Frames": 25
+}

data/diffusion/image-to-video/H100 80GB HBM3/stabilityai/stable-video-diffusion-img2vid/bs1+steps25+frames14.json
CHANGED

@@ -1,8 +1,8 @@
 {
   "Model": "stabilityai/stable-video-diffusion-img2vid",
   "GPU": "NVIDIA H100 80GB HBM3",
-  "Energy/video (J)":
-  "Batch latency (s)": 12.
+  "Energy/video (J)": 7623.074500000104,
+  "Batch latency (s)": 12.191031396389008,
   "Batch size": 1,
   "Denoising steps": 25,
   "Frames": 14

data/diffusion/image-to-video/H100 80GB HBM3/stabilityai/stable-video-diffusion-img2vid/bs2+steps25+frames14.json
ADDED

@@ -0,0 +1,9 @@
+{
+  "Model": "stabilityai/stable-video-diffusion-img2vid",
+  "GPU": "NVIDIA H100 80GB HBM3",
+  "Energy/video (J)": 7416.721437499975,
+  "Batch latency (s)": 23.368041068315506,
+  "Batch size": 2,
+  "Denoising steps": 25,
+  "Frames": 14
+}

data/diffusion/image-to-video/H100 80GB HBM3/stabilityai/stable-video-diffusion-img2vid/bs3+steps25+frames14.json
ADDED

@@ -0,0 +1,9 @@
+{
+  "Model": "stabilityai/stable-video-diffusion-img2vid",
+  "GPU": "NVIDIA H100 80GB HBM3",
+  "Energy/video (J)": 7354.00133333333,
+  "Batch latency (s)": 34.5100462777274,
+  "Batch size": 3,
+  "Denoising steps": 25,
+  "Frames": 14
+}
    	
        data/diffusion/text-to-image/A100-SXM4-40GB/kandinsky-community/kandinsky-2-2-decoder/bs1+steps25.json
    CHANGED
    
    | @@ -1,8 +1,8 @@ | |
| 1 | 
             
            {
         | 
| 2 | 
             
              "Model": "kandinsky-community/kandinsky-2-2-decoder",
         | 
| 3 | 
             
              "GPU": "NVIDIA A100-SXM4-40GB",
         | 
| 4 | 
            -
              "Energy/image (J)":  | 
| 5 | 
            -
              "Batch latency (s)": 1. | 
| 6 | 
             
              "Batch size": 1,
         | 
| 7 | 
             
              "Denoising steps": 25
         | 
| 8 | 
             
            }
         | 
|  | |
| 1 | 
             
            {
         | 
| 2 | 
             
              "Model": "kandinsky-community/kandinsky-2-2-decoder",
         | 
| 3 | 
             
              "GPU": "NVIDIA A100-SXM4-40GB",
         | 
| 4 | 
            +
              "Energy/image (J)": 324.06850000005215,
         | 
| 5 | 
            +
              "Batch latency (s)": 1.6537675857543945,
         | 
| 6 | 
             
              "Batch size": 1,
         | 
| 7 | 
             
              "Denoising steps": 25
         | 
| 8 | 
             
            }
         | 
    	
        data/diffusion/text-to-image/A100-SXM4-40GB/kandinsky-community/kandinsky-2-2-decoder/bs16+steps25.json
    CHANGED
    
    | @@ -1,8 +1,8 @@ | |
| 1 | 
             
            {
         | 
| 2 | 
             
              "Model": "kandinsky-community/kandinsky-2-2-decoder",
         | 
| 3 | 
             
              "GPU": "NVIDIA A100-SXM4-40GB",
         | 
| 4 | 
            -
              "Energy/image (J)":  | 
| 5 | 
            -
              "Batch latency (s)": 7. | 
| 6 | 
             
              "Batch size": 16,
         | 
| 7 | 
             
              "Denoising steps": 25
         | 
| 8 | 
             
            }
         | 
|  | |
| 1 | 
             
            {
         | 
| 2 | 
             
              "Model": "kandinsky-community/kandinsky-2-2-decoder",
         | 
| 3 | 
             
              "GPU": "NVIDIA A100-SXM4-40GB",
         | 
| 4 | 
            +
              "Energy/image (J)": 172.51030000000029,
         | 
| 5 | 
            +
              "Batch latency (s)": 7.375234842300415,
         | 
| 6 | 
             
              "Batch size": 16,
         | 
| 7 | 
             
              "Denoising steps": 25
         | 
| 8 | 
             
            }
         | 
    	
        data/diffusion/text-to-image/A100-SXM4-40GB/kandinsky-community/kandinsky-2-2-decoder/bs2+steps25.json
    CHANGED
    
    | @@ -1,8 +1,8 @@ | |
| 1 | 
             
            {
         | 
| 2 | 
             
              "Model": "kandinsky-community/kandinsky-2-2-decoder",
         | 
| 3 | 
             
              "GPU": "NVIDIA A100-SXM4-40GB",
         | 
| 4 | 
            -
              "Energy/image (J)":  | 
| 5 | 
            -
              "Batch latency (s)": 1. | 
| 6 | 
             
              "Batch size": 2,
         | 
| 7 | 
             
              "Denoising steps": 25
         | 
| 8 | 
             
            }
         | 
|  | |
| 1 | 
             
            {
         | 
| 2 | 
             
              "Model": "kandinsky-community/kandinsky-2-2-decoder",
         | 
| 3 | 
             
              "GPU": "NVIDIA A100-SXM4-40GB",
         | 
| 4 | 
            +
              "Energy/image (J)": 230.3378000000026,
         | 
| 5 | 
            +
              "Batch latency (s)": 1.5861663103103638,
         | 
| 6 | 
             
              "Batch size": 2,
         | 
| 7 | 
             
              "Denoising steps": 25
         | 
| 8 | 
             
            }
         | 
    	
        data/diffusion/text-to-image/A100-SXM4-40GB/kandinsky-community/kandinsky-2-2-decoder/bs32+steps25.json
    ADDED
    
    | @@ -0,0 +1,8 @@ | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | 
|  | |
| 1 | 
            +
            {
         | 
| 2 | 
            +
              "Model": "kandinsky-community/kandinsky-2-2-decoder",
         | 
| 3 | 
            +
              "GPU": "NVIDIA A100-SXM4-40GB",
         | 
| 4 | 
            +
              "Energy/image (J)": 163.0797656249997,
         | 
| 5 | 
            +
              "Batch latency (s)": 13.998618459701538,
         | 
| 6 | 
            +
              "Batch size": 32,
         | 
| 7 | 
            +
              "Denoising steps": 25
         | 
| 8 | 
            +
            }
         | 
    	
        data/diffusion/text-to-image/A100-SXM4-40GB/kandinsky-community/kandinsky-2-2-decoder/bs4+steps25.json
    CHANGED
    
    | @@ -1,8 +1,8 @@ | |
| 1 | 
             
            {
         | 
| 2 | 
             
              "Model": "kandinsky-community/kandinsky-2-2-decoder",
         | 
| 3 | 
             
              "GPU": "NVIDIA A100-SXM4-40GB",
         | 
| 4 | 
            -
              "Energy/image (J)":  | 
| 5 | 
            -
              "Batch latency (s)": 2. | 
| 6 | 
             
              "Batch size": 4,
         | 
| 7 | 
             
              "Denoising steps": 25
         | 
| 8 | 
             
            }
         | 
|  | |
| 1 | 
             
            {
         | 
| 2 | 
             
              "Model": "kandinsky-community/kandinsky-2-2-decoder",
         | 
| 3 | 
             
              "GPU": "NVIDIA A100-SXM4-40GB",
         | 
| 4 | 
            +
              "Energy/image (J)": 200.16462499999906,
         | 
| 5 | 
            +
              "Batch latency (s)": 2.299217462539673,
         | 
| 6 | 
             
              "Batch size": 4,
         | 
| 7 | 
             
              "Denoising steps": 25
         | 
| 8 | 
             
            }
         | 
    	
        data/diffusion/text-to-image/A100-SXM4-40GB/kandinsky-community/kandinsky-2-2-decoder/bs8+steps25.json
    CHANGED
    
    | @@ -1,8 +1,8 @@ | |
| 1 | 
             
            {
         | 
| 2 | 
             
              "Model": "kandinsky-community/kandinsky-2-2-decoder",
         | 
| 3 | 
             
              "GPU": "NVIDIA A100-SXM4-40GB",
         | 
| 4 | 
            -
              "Energy/image (J)":  | 
| 5 | 
            -
              "Batch latency (s)": 4. | 
| 6 | 
             
              "Batch size": 8,
         | 
| 7 | 
             
              "Denoising steps": 25
         | 
| 8 | 
             
            }
         | 
|  | |
| 1 | 
             
            {
         | 
| 2 | 
             
              "Model": "kandinsky-community/kandinsky-2-2-decoder",
         | 
| 3 | 
             
              "GPU": "NVIDIA A100-SXM4-40GB",
         | 
| 4 | 
            +
              "Energy/image (J)": 184.9021625000052,
         | 
| 5 | 
            +
              "Batch latency (s)": 4.0124232292175295,
         | 
| 6 | 
             
              "Batch size": 8,
         | 
| 7 | 
             
              "Denoising steps": 25
         | 
| 8 | 
             
            }
         | 
    	
        data/diffusion/text-to-image/A100-SXM4-40GB/kandinsky-community/kandinsky-3/bs1+steps25.json
    CHANGED
    
    | @@ -1,8 +1,8 @@ | |
| 1 | 
             
            {
         | 
| 2 | 
             
              "Model": "kandinsky-community/kandinsky-3",
         | 
| 3 | 
             
              "GPU": "NVIDIA A100-SXM4-40GB",
         | 
| 4 | 
            -
              "Energy/image (J)":  | 
| 5 | 
            -
              "Batch latency (s)": 3. | 
| 6 | 
             
              "Batch size": 1,
         | 
| 7 | 
             
              "Denoising steps": 25
         | 
| 8 | 
             
            }
         | 
|  | |
| 1 | 
             
            {
         | 
| 2 | 
             
              "Model": "kandinsky-community/kandinsky-3",
         | 
| 3 | 
             
              "GPU": "NVIDIA A100-SXM4-40GB",
         | 
| 4 | 
            +
              "Energy/image (J)": 930.2532999999821,
         | 
| 5 | 
            +
              "Batch latency (s)": 3.0359585523605346,
         | 
| 6 | 
             
              "Batch size": 1,
         | 
| 7 | 
             
              "Denoising steps": 25
         | 
| 8 | 
             
            }
         | 
    	
data/diffusion/text-to-image/A100-SXM4-40GB/kandinsky-community/kandinsky-3/bs2+steps25.json
CHANGED

@@ -1,8 +1,8 @@
 {
   "Model": "kandinsky-community/kandinsky-3",
   "GPU": "NVIDIA A100-SXM4-40GB",
-  "Energy/image (J)": 
-  "Batch latency (s)": 5.
+  "Energy/image (J)": 895.7575500000036,
+  "Batch latency (s)": 5.261959171295166,
   "Batch size": 2,
   "Denoising steps": 25
 }

data/diffusion/text-to-image/A100-SXM4-40GB/prompthero/openjourney-v4/bs1+steps25.json
CHANGED

@@ -1,8 +1,8 @@
 {
   "Model": "prompthero/openjourney-v4",
   "GPU": "NVIDIA A100-SXM4-40GB",
-  "Energy/image (J)": 
-  "Batch latency (s)": 
+  "Energy/image (J)": 227.21699999999254,
+  "Batch latency (s)": 0.9210062503814698,
   "Batch size": 1,
   "Denoising steps": 25
 }

data/diffusion/text-to-image/A100-SXM4-40GB/prompthero/openjourney-v4/bs16+steps25.json
CHANGED

@@ -1,8 +1,8 @@
 {
   "Model": "prompthero/openjourney-v4",
   "GPU": "NVIDIA A100-SXM4-40GB",
-  "Energy/image (J)": 
-  "Batch latency (s)": 6.
+  "Energy/image (J)": 156.51368749999673,
+  "Batch latency (s)": 6.559858226776123,
   "Batch size": 16,
   "Denoising steps": 25
 }

data/diffusion/text-to-image/A100-SXM4-40GB/prompthero/openjourney-v4/bs2+steps25.json
CHANGED

@@ -1,8 +1,8 @@
 {
   "Model": "prompthero/openjourney-v4",
   "GPU": "NVIDIA A100-SXM4-40GB",
-  "Energy/image (J)": 
-  "Batch latency (s)": 1.
+  "Energy/image (J)": 188.78500000000932,
+  "Batch latency (s)": 1.1187455892562865,
   "Batch size": 2,
   "Denoising steps": 25
 }

data/diffusion/text-to-image/A100-SXM4-40GB/prompthero/openjourney-v4/bs32+steps25.json
ADDED

@@ -0,0 +1,8 @@
+{
+  "Model": "prompthero/openjourney-v4",
+  "GPU": "NVIDIA A100-SXM4-40GB",
+  "Energy/image (J)": 154.23499999999768,
+  "Batch latency (s)": 12.850126147270203,
+  "Batch size": 32,
+  "Denoising steps": 25
+}

data/diffusion/text-to-image/A100-SXM4-40GB/prompthero/openjourney-v4/bs4+steps25.json
CHANGED

@@ -1,8 +1,8 @@
 {
   "Model": "prompthero/openjourney-v4",
   "GPU": "NVIDIA A100-SXM4-40GB",
-  "Energy/image (J)": 
-  "Batch latency (s)": 1.
+  "Energy/image (J)": 175.33082500000017,
+  "Batch latency (s)": 1.8664743423461914,
   "Batch size": 4,
   "Denoising steps": 25
 }

data/diffusion/text-to-image/A100-SXM4-40GB/prompthero/openjourney-v4/bs64+steps25.json
ADDED

@@ -0,0 +1,8 @@
+{
+  "Model": "prompthero/openjourney-v4",
+  "GPU": "NVIDIA A100-SXM4-40GB",
+  "Energy/image (J)": 150.57691875000017,
+  "Batch latency (s)": 25.000647592544556,
+  "Batch size": 64,
+  "Denoising steps": 25
+}

data/diffusion/text-to-image/A100-SXM4-40GB/prompthero/openjourney-v4/bs8+steps25.json
CHANGED

@@ -1,8 +1,8 @@
 {
   "Model": "prompthero/openjourney-v4",
   "GPU": "NVIDIA A100-SXM4-40GB",
-  "Energy/image (J)": 
-  "Batch latency (s)": 3.
+  "Energy/image (J)": 163.7534500000067,
+  "Batch latency (s)": 3.423132634162903,
   "Batch size": 8,
   "Denoising steps": 25
 }

data/diffusion/text-to-image/A100-SXM4-40GB/segmind/SSD-1B/bs1+steps25.json
CHANGED

@@ -1,8 +1,8 @@
 {
   "Model": "segmind/SSD-1B",
   "GPU": "NVIDIA A100-SXM4-40GB",
-  "Energy/image (J)": 
-  "Batch latency (s)": 1.
+  "Energy/image (J)": 745.7899999999441,
+  "Batch latency (s)": 1.9644724607467652,
   "Batch size": 1,
   "Denoising steps": 25
 }

data/diffusion/text-to-image/A100-SXM4-40GB/segmind/SSD-1B/bs2+steps25.json
CHANGED

@@ -1,8 +1,8 @@
 {
   "Model": "segmind/SSD-1B",
   "GPU": "NVIDIA A100-SXM4-40GB",
-  "Energy/image (J)": 
-  "Batch latency (s)": 3.
+  "Energy/image (J)": 700.4580500000156,
+  "Batch latency (s)": 3.6897377252578734,
   "Batch size": 2,
   "Denoising steps": 25
 }

data/diffusion/text-to-image/A100-SXM4-40GB/segmind/SSD-1B/bs4+steps25.json
CHANGED

@@ -1,8 +1,8 @@
 {
   "Model": "segmind/SSD-1B",
   "GPU": "NVIDIA A100-SXM4-40GB",
-  "Energy/image (J)": 
-  "Batch latency (s)": 7.
+  "Energy/image (J)": 688.6121250000084,
+  "Batch latency (s)": 7.168970584869385,
   "Batch size": 4,
   "Denoising steps": 25
 }
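For reference, each JSON file touched in this commit holds a single benchmark record with the keys "Model", "GPU", "Energy/image (J)", "Batch latency (s)", "Batch size", and "Denoising steps". Below is a minimal sketch of how such records could be loaded and compared across batch sizes; the result directory path and the derived per-image latency are illustrative assumptions, not part of this commit.

# Hypothetical helper: load per-configuration records like the ones above
# and print the energy/latency trade-off across batch sizes.
import json
from pathlib import Path

# Assumed layout, matching the file paths in this diff:
# data/diffusion/text-to-image/<GPU>/<org>/<model>/bs<batch>+steps<steps>.json
RESULT_DIR = Path("data/diffusion/text-to-image/A100-SXM4-40GB/prompthero/openjourney-v4")

records = []
for path in sorted(RESULT_DIR.glob("bs*+steps25.json")):
    with open(path) as f:
        record = json.load(f)
    # Derive per-image latency from the batch-level measurement (illustrative).
    record["Latency/image (s)"] = record["Batch latency (s)"] / record["Batch size"]
    records.append(record)

for r in sorted(records, key=lambda r: r["Batch size"]):
    print(
        f"bs={r['Batch size']:>2}  "
        f"energy/image={r['Energy/image (J)']:8.2f} J  "
        f"latency/image={r['Latency/image (s)']:6.3f} s"
    )

On the openjourney-v4 A100 data above, for example, energy per image drops from roughly 227 J at batch size 1 to roughly 151 J at batch size 64, while batch latency grows from under a second to about 25 s.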