FLUX.1-dev-fa3-aoti-blocks-load

Running on Zero

App Files Files Community

cbensimon HF Staff committed on Sep 10

Commit

4e94151

1 Parent(s): 39286c5

AOTI load

Browse files

Files changed (3) hide show

aoti.py +19 -0
app.py +3 -2
optimization.py +0 -67

aoti.py ADDED Viewed

	@@ -0,0 +1,19 @@

+"""
+"""
+import torch
+from huggingface_hub import hf_hub_download
+from spaces.zero.torch.aoti import ZeroGPUCompiledModel
+from spaces.zero.torch.aoti import ZeroGPUWeights
+import fa3
+def aoti_load(module: torch.nn.Module, repo_id: str):  # rebind forward on every repeated block of `module` to a compiled model fetched from `repo_id`; mutates `module`, returns None
+    repeated_blocks = module._repeated_blocks  # iterated as block class names (see the __name__ comparison below)
+    aoti_files = {name: hf_hub_download(repo_id, f'{name}.pt2') for name in repeated_blocks}  # one '<class name>.pt2' download per name; all downloads finish before any block is patched
+    for block_name, aoti_file in aoti_files.items():
+        for block in module.modules():
+            if block.__class__.__name__ == block_name:  # exact class-name match; a subclass with a different name is not matched
+                weights = ZeroGPUWeights(block.state_dict())  # built per block from that block's own state_dict
+                block.forward = ZeroGPUCompiledModel(aoti_file, weights)  # instance-attribute override; blocks of the same class share aoti_file

app.py CHANGED Viewed

@@ -5,11 +5,12 @@ import spaces
 import torch
 from diffusers import FluxPipeline
-from optimization import optimize_pipeline_
 pipeline = FluxPipeline.from_pretrained('black-forest-labs/FLUX.1-dev', torch_dtype=torch.bfloat16).to('cuda')
-optimize_pipeline_(pipeline, "prompt")
 @spaces.GPU

 import torch
 from diffusers import FluxPipeline
+from aoti import aoti_load
 pipeline = FluxPipeline.from_pretrained('black-forest-labs/FLUX.1-dev', torch_dtype=torch.bfloat16).to('cuda')
+pipeline.transformer.fuse_qkv_projections()
+aoti_load(pipeline.transformer, 'zerogpu-aoti/FLUX.1')
 @spaces.GPU

optimization.py DELETED Viewed

@@ -1,67 +0,0 @@
-"""
-"""
-from typing import Any
-from typing import Callable
-from typing import ParamSpec
-import spaces
-import torch
-from spaces.zero.torch.aoti import ZeroGPUCompiledModel
-from spaces.zero.torch.aoti import ZeroGPUWeights
-from fa3 import FlashFusedFluxAttnProcessor3_0
-P = ParamSpec('P')  # ties optimize_pipeline_'s *args/**kwargs annotations to the pipeline callable's parameters
-INDUCTOR_CONFIGS = {  # passed as the second argument to spaces.aoti_compile for both exported blocks
-    'conv_1x1_as_mm': True,
-    'epilogue_fusion': False,
-    'coordinate_descent_tuning': True,
-    'coordinate_descent_check_all_directions': True,
-    'max_autotune': True,
-    'triton.cudagraphs': True,
-}
-def optimize_pipeline_(pipeline: Callable[P, Any], *args: P.args, **kwargs: P.kwargs):  # compile the first block of each transformer block list and rebind forward on every block; *args/**kwargs are forwarded to pipeline(...) for the capture runs
-    blocks_A = pipeline.transformer.transformer_blocks
-    blocks_B = pipeline.transformer.single_transformer_blocks
-    @spaces.GPU(duration=1500)
-    def compile_transformer_block_AB():  # returns a 2-tuple of archive files: (for blocks_A, for blocks_B)
-        with spaces.aoti_capture(blocks_A[0]) as call_A:  # call_A.args / call_A.kwargs are read below as the export inputs for blocks_A[0]
-            pipeline(*args, **kwargs)
-        with spaces.aoti_capture(blocks_B[0]) as call_B:  # second full pipeline run, this time capturing blocks_B[0]
-            pipeline(*args, **kwargs)
-        exported_A = torch.export.export(
-            mod=blocks_A[0],
-            args=call_A.args,
-            kwargs=call_A.kwargs,
-        )
-        exported_B = torch.export.export(
-            mod=blocks_B[0],
-            args=call_B.args,
-            kwargs=call_B.kwargs,
-        )
-        return (
-            spaces.aoti_compile(exported_A, INDUCTOR_CONFIGS).archive_file,
-            spaces.aoti_compile(exported_B, INDUCTOR_CONFIGS).archive_file,
-        )
-    pipeline.transformer.fuse_qkv_projections()  # runs before compile_transformer_block_AB() is called, so capture and export happen after fusion
-    pipeline.transformer.set_attn_processor(FlashFusedFluxAttnProcessor3_0())  # likewise set before the capture runs
-    archive_file_A, archive_file_B = compile_transformer_block_AB()
-    for blocks, archive_file in zip((blocks_A, blocks_B), (archive_file_A, archive_file_B)):  # every block in a list shares that list's archive file
-        for block in blocks:
-            weights = ZeroGPUWeights(block.state_dict())  # built per block from that block's own state_dict
-            block.forward = ZeroGPUCompiledModel(archive_file, weights)  # instance-attribute override of forward