|
|
|
-------------------------- DeepSpeed Flops Profiler -------------------------- |
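
This report is produced by the DeepSpeed flops profiler. As an illustration only (the actual configuration used for this run is not shown here), such a report is typically enabled through a flops_profiler block in the DeepSpeed config; the minimal sketch below uses values inferred from the report itself:

    # Hypothetical ds_config fragment; values mirror this report, not a known config file.
    ds_config = {
        "train_micro_batch_size_per_gpu": 16,  # "batch size per GPU: 16"
        "flops_profiler": {
            "enabled": True,
            "profile_step": 2,    # "Profile Summary at step 2"
            "module_depth": -1,   # profile every module depth
            "top_modules": 1,     # "Top 1 modules ... at different model depths"
            "detailed": True,     # print the per-module tree in the detailed profile
            "output_file": None,
        },
    }
    # engine, _, _, _ = deepspeed.initialize(model=model, config=ds_config, ...)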
|
Profile Summary at step 2: |
|
Notations: |
|
data parallel size (dp_size), model parallel size (mp_size), |
|
number of parameters (params), number of multiply-accumulate operations (MACs), |
|
number of floating-point operations (flops), floating-point operations per second (FLOPS), |
|
fwd latency (forward propagation latency), bwd latency (backward propagation latency), |
|
step (weights update latency), iter latency (sum of fwd, bwd and step latency) |
|
|
|
world size: 32 |
|
data parallel size: 32 |
|
model parallel size: 1 |
|
batch size per GPU: 16 |
|
params per GPU: 9.21 B |
|
params of model = params per GPU * mp_size: 9.21 B |
|
fwd MACs per GPU: 24.91 TMACs |
|
fwd flops per GPU: 49.82 T |
|
fwd flops of model = fwd flops per GPU * mp_size: 49.82 T |
|
fwd latency: 246.3 ms |
|
fwd FLOPS per GPU = fwd flops per GPU / fwd latency: 202.25 TFLOPS |
|
bwd latency: 973.32 ms |
|
bwd FLOPS per GPU = 2 * fwd flops per GPU / bwd latency: 102.36 TFLOPS |
|
fwd+bwd FLOPS per GPU = 3 * fwd flops per GPU / (fwd+bwd latency): 122.54 TFLOPS |
|
step latency: 441.01 ms |
|
iter latency: 1.66 s |
|
FLOPS per GPU = 3 * fwd flops per GPU / iter latency: 89.99 TFLOPS |
|
samples/second: 308.32 |
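
The derived throughput lines above follow from the raw numbers in this summary. The small Python sketch below (using only values printed above; results differ slightly from the report because the printed latencies are rounded) recomputes them; the factor 3 reflects the profiler's assumption that the backward pass costs roughly twice the forward pass in flops:

    # Recompute the summary's derived metrics from its printed inputs.
    fwd_flops_per_gpu = 49.82e12   # "fwd flops per GPU"
    fwd_latency = 246.3e-3         # s
    bwd_latency = 973.32e-3        # s
    iter_latency = 1.66            # s
    batch_per_gpu, world_size = 16, 32

    print(fwd_flops_per_gpu / fwd_latency / 1e12)                       # ~202.3 TFLOPS (fwd)
    print(2 * fwd_flops_per_gpu / bwd_latency / 1e12)                   # ~102.4 TFLOPS (bwd)
    print(3 * fwd_flops_per_gpu / (fwd_latency + bwd_latency) / 1e12)   # ~122.5 TFLOPS (fwd+bwd)
    print(3 * fwd_flops_per_gpu / iter_latency / 1e12)                  # ~90.0 TFLOPS (full iteration)
    print(batch_per_gpu * world_size / iter_latency)                    # ~308.4 samples/second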
|
|
|
----------------------------- Aggregated Profile per GPU ----------------------------- |
|
Top 1 modules in terms of params, MACs or fwd latency at different model depths: |
|
depth 0: |
|
params - {'DiT': '9.21 B'} |
|
MACs - {'DiT': '24.91 TMACs'} |
|
fwd latency - {'DiT': '246.07 ms'} |
|
depth 1: |
|
params - {'ModuleList': '9.13 B'} |
|
MACs - {'ModuleList': '24.81 TMACs'} |
|
fwd latency - {'ModuleList': '234.7 ms'} |
|
depth 2: |
|
params - {'DiTLayer': '9.13 B'} |
|
MACs - {'DiTLayer': '24.81 TMACs'} |
|
fwd latency - {'DiTLayer': '234.7 ms'} |
|
depth 3: |
|
params - {'GemmaMLP': '4.3 B'} |
|
MACs - {'GemmaMLP': '17.61 TMACs'} |
|
fwd latency - {'DiTSelfAttention': '96.91 ms'} |
|
|
|
------------------------------ Detailed Profile per GPU ------------------------------ |
|
Each module profile is listed after its name in the following order: |
|
params, percentage of total params, MACs, percentage of total MACs, fwd latency, percentage of total fwd latency, fwd FLOPS |
|
|
|
Note: 1. A module can use torch.nn.Module or torch.nn.functional operations to compute logits (e.g. CrossEntropyLoss). These are not counted as submodules and thus are not printed out; however, they account for the difference between a parent's MACs (or latency) and the sum of its submodules'. |
|
2. The number of floating-point operations is a theoretical estimate, so the FLOPS computed from it can exceed the maximum achievable system throughput. |
|
3. The fwd latency listed in the top module's profile is captured directly at the module's forward function in PyTorch, so it is less than the fwd latency shown above, which is measured by DeepSpeed. |
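
A per-module breakdown like the tree below can also be produced outside of a training run with DeepSpeed's standalone profiler. A minimal sketch, assuming only that a torch.nn.Module and a representative input are available (the tiny model and input shape here are placeholders, not details of the DiT run profiled below):

    import torch
    from deepspeed.profiling.flops_profiler import get_model_profile

    # Placeholder model and input; substitute the real module and its forward() inputs.
    model = torch.nn.Sequential(torch.nn.Linear(4096, 4096), torch.nn.GELU())
    example_input = torch.randn(16, 4096)

    with torch.no_grad():
        flops, macs, params = get_model_profile(
            model=model,
            args=[example_input],  # positional arguments passed to model.forward
            print_profile=True,    # print a summary plus a per-module tree like the one below
            detailed=True,         # include the per-module breakdown
            module_depth=-1,       # profile all depths
            top_modules=1,         # report the top 1 module at each depth
            warm_up=1,
            as_string=True,        # return counts formatted as human-readable strings
        )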
|
|
|
DiT( |
|
9.21 B = 100% Params, 24.91 TMACs = 100% MACs, 246.07 ms = 100% latency, 202.45 TFLOPS |
|
(layers): ModuleList( |
|
(0): DiTLayer( |
|
285.41 M = 3.1% Params, 775.38 GMACs = 3.11% MACs, 7.34 ms = 2.98% latency, 211.41 TFLOPS |
|
(input_layernorm): AdaLayerNormZero( |
|
100.69 M = 1.09% Params, 1.61 GMACs = 0.01% MACs, 991.11 us = 0.4% latency, 3.25 TFLOPS |
|
(silu): SiLU(0 = 0% Params, 0 MACs = 0% MACs, 39.82 us = 0.02% latency, 1.65 GFLOPS) |
|
(linear): Linear(100.69 M = 1.09% Params, 1.61 GMACs = 0.01% MACs, 270.13 us = 0.11% latency, 11.92 TFLOPS, in_features=4096, out_features=24576, bias=True) |
|
(norm): GemmaRMSNorm(4.1 K = 0% Params, 0 MACs = 0% MACs, 462.53 us = 0.19% latency, 0 FLOPS) |
|
) |
|
(self_attn): DiTSelfAttention( |
|
50.33 M = 0.55% Params, 223.34 GMACs = 0.9% MACs, 3.05 ms = 1.24% latency, 146.31 TFLOPS |
|
(q_norm): GemmaRMSNorm(128 = 0% Params, 0 MACs = 0% MACs, 463.96 us = 0.19% latency, 0 FLOPS) |
|
(k_norm): GemmaRMSNorm(128 = 0% Params, 0 MACs = 0% MACs, 236.27 us = 0.1% latency, 0 FLOPS) |
|
(q_proj): Linear(16.78 M = 0.18% Params, 68.72 GMACs = 0.28% MACs, 361.44 us = 0.15% latency, 380.25 TFLOPS, in_features=4096, out_features=4096, bias=False) |
|
(k_proj): Linear(4.19 M = 0.05% Params, 17.18 GMACs = 0.07% MACs, 199.32 us = 0.08% latency, 172.39 TFLOPS, in_features=4096, out_features=1024, bias=False) |
|
(v_proj): Linear(4.19 M = 0.05% Params, 17.18 GMACs = 0.07% MACs, 195.5 us = 0.08% latency, 175.75 TFLOPS, in_features=4096, out_features=1024, bias=False) |
|
(text_k_proj): Linear(4.19 M = 0.05% Params, 17.18 GMACs = 0.07% MACs, 157.59 us = 0.06% latency, 218.03 TFLOPS, in_features=4096, out_features=1024, bias=False) |
|
(text_v_proj): Linear(4.19 M = 0.05% Params, 17.18 GMACs = 0.07% MACs, 153.78 us = 0.06% latency, 223.43 TFLOPS, in_features=4096, out_features=1024, bias=False) |
|
(o_proj): Linear(16.78 M = 0.18% Params, 68.72 GMACs = 0.28% MACs, 354.53 us = 0.14% latency, 387.67 TFLOPS, in_features=4096, out_features=4096, bias=False) |
|
) |
|
(post_attention_layernorm): GemmaRMSNorm(4.1 K = 0% Params, 0 MACs = 0% MACs, 457.76 us = 0.19% latency, 0 FLOPS) |
|
(mlp): GemmaMLP( |
|
134.38 M = 1.46% Params, 550.43 GMACs = 2.21% MACs, 2.18 ms = 0.88% latency, 505.97 TFLOPS |
|
(gate_proj): Linear(44.79 M = 0.49% Params, 183.48 GMACs = 0.74% MACs, 628.23 us = 0.26% latency, 584.1 TFLOPS, in_features=4096, out_features=10936, bias=False) |
|
(up_proj): Linear(44.79 M = 0.49% Params, 183.48 GMACs = 0.74% MACs, 624.9 us = 0.25% latency, 587.22 TFLOPS, in_features=4096, out_features=10936, bias=False) |
|
(down_proj): Linear(44.79 M = 0.49% Params, 183.48 GMACs = 0.74% MACs, 595.33 us = 0.24% latency, 616.38 TFLOPS, in_features=10936, out_features=4096, bias=False) |
|
(act_fn): PytorchGELUTanh(0 = 0% Params, 0 MACs = 0% MACs, 116.11 us = 0.05% latency, 385.79 GFLOPS) |
|
) |
|
) |
|
(1): DiTLayer( |
|
285.41 M = 3.1% Params, 775.38 GMACs = 3.11% MACs, 7.2 ms = 2.93% latency, 215.36 TFLOPS |
|
(input_layernorm): AdaLayerNormZero( |
|
100.69 M = 1.09% Params, 1.61 GMACs = 0.01% MACs, 948.19 us = 0.39% latency, 3.4 TFLOPS |
|
(silu): SiLU(0 = 0% Params, 0 MACs = 0% MACs, 36.72 us = 0.01% latency, 1.78 GFLOPS) |
|
(linear): Linear(100.69 M = 1.09% Params, 1.61 GMACs = 0.01% MACs, 238.9 us = 0.1% latency, 13.48 TFLOPS, in_features=4096, out_features=24576, bias=True) |
|
(norm): GemmaRMSNorm(4.1 K = 0% Params, 0 MACs = 0% MACs, 461.1 us = 0.19% latency, 0 FLOPS) |
|
) |
|
(self_attn): DiTSelfAttention( |
|
50.33 M = 0.55% Params, 223.34 GMACs = 0.9% MACs, 3 ms = 1.22% latency, 148.78 TFLOPS |
|
(q_norm): GemmaRMSNorm(128 = 0% Params, 0 MACs = 0% MACs, 462.77 us = 0.19% latency, 0 FLOPS) |
|
(k_norm): GemmaRMSNorm(128 = 0% Params, 0 MACs = 0% MACs, 235.32 us = 0.1% latency, 0 FLOPS) |
|
(q_proj): Linear(16.78 M = 0.18% Params, 68.72 GMACs = 0.28% MACs, 348.09 us = 0.14% latency, 394.84 TFLOPS, in_features=4096, out_features=4096, bias=False) |
|
(k_proj): Linear(4.19 M = 0.05% Params, 17.18 GMACs = 0.07% MACs, 197.17 us = 0.08% latency, 174.26 TFLOPS, in_features=4096, out_features=1024, bias=False) |
|
(v_proj): Linear(4.19 M = 0.05% Params, 17.18 GMACs = 0.07% MACs, 194.55 us = 0.08% latency, 176.61 TFLOPS, in_features=4096, out_features=1024, bias=False) |
|
(text_k_proj): Linear(4.19 M = 0.05% Params, 17.18 GMACs = 0.07% MACs, 157.83 us = 0.06% latency, 217.7 TFLOPS, in_features=4096, out_features=1024, bias=False) |
|
(text_v_proj): Linear(4.19 M = 0.05% Params, 17.18 GMACs = 0.07% MACs, 154.5 us = 0.06% latency, 222.4 TFLOPS, in_features=4096, out_features=1024, bias=False) |
|
(o_proj): Linear(16.78 M = 0.18% Params, 68.72 GMACs = 0.28% MACs, 343.8 us = 0.14% latency, 399.76 TFLOPS, in_features=4096, out_features=4096, bias=False) |
|
) |
|
(post_attention_layernorm): GemmaRMSNorm(4.1 K = 0% Params, 0 MACs = 0% MACs, 457.05 us = 0.19% latency, 0 FLOPS) |
|
(mlp): GemmaMLP( |
|
134.38 M = 1.46% Params, 550.43 GMACs = 2.21% MACs, 2.14 ms = 0.87% latency, 514.26 TFLOPS |
|
(gate_proj): Linear(44.79 M = 0.49% Params, 183.48 GMACs = 0.74% MACs, 627.04 us = 0.25% latency, 585.21 TFLOPS, in_features=4096, out_features=10936, bias=False) |
|
(up_proj): Linear(44.79 M = 0.49% Params, 183.48 GMACs = 0.74% MACs, 621.8 us = 0.25% latency, 590.15 TFLOPS, in_features=4096, out_features=10936, bias=False) |
|
(down_proj): Linear(44.79 M = 0.49% Params, 183.48 GMACs = 0.74% MACs, 587.7 us = 0.24% latency, 624.38 TFLOPS, in_features=10936, out_features=4096, bias=False) |
|
(act_fn): PytorchGELUTanh(0 = 0% Params, 0 MACs = 0% MACs, 100.85 us = 0.04% latency, 444.16 GFLOPS) |
|
) |
|
) |
|
(2): DiTLayer( |
|
285.41 M = 3.1% Params, 775.38 GMACs = 3.11% MACs, 7.22 ms = 2.93% latency, 214.76 TFLOPS |
|
(input_layernorm): AdaLayerNormZero( |
|
100.69 M = 1.09% Params, 1.61 GMACs = 0.01% MACs, 947.48 us = 0.39% latency, 3.4 TFLOPS |
|
(silu): SiLU(0 = 0% Params, 0 MACs = 0% MACs, 37.43 us = 0.02% latency, 1.75 GFLOPS) |
|
(linear): Linear(100.69 M = 1.09% Params, 1.61 GMACs = 0.01% MACs, 238.66 us = 0.1% latency, 13.5 TFLOPS, in_features=4096, out_features=24576, bias=True) |
|
(norm): GemmaRMSNorm(4.1 K = 0% Params, 0 MACs = 0% MACs, 462.06 us = 0.19% latency, 0 FLOPS) |
|
) |
|
(self_attn): DiTSelfAttention( |
|
50.33 M = 0.55% Params, 223.34 GMACs = 0.9% MACs, 3.02 ms = 1.23% latency, 148.02 TFLOPS |
|
(q_norm): GemmaRMSNorm(128 = 0% Params, 0 MACs = 0% MACs, 466.11 us = 0.19% latency, 0 FLOPS) |
|
(k_norm): GemmaRMSNorm(128 = 0% Params, 0 MACs = 0% MACs, 234.6 us = 0.1% latency, 0 FLOPS) |
|
(q_proj): Linear(16.78 M = 0.18% Params, 68.72 GMACs = 0.28% MACs, 349.04 us = 0.14% latency, 393.76 TFLOPS, in_features=4096, out_features=4096, bias=False) |
|
(k_proj): Linear(4.19 M = 0.05% Params, 17.18 GMACs = 0.07% MACs, 198.13 us = 0.08% latency, 173.42 TFLOPS, in_features=4096, out_features=1024, bias=False) |
|
(v_proj): Linear(4.19 M = 0.05% Params, 17.18 GMACs = 0.07% MACs, 194.79 us = 0.08% latency, 176.4 TFLOPS, in_features=4096, out_features=1024, bias=False) |
|
(text_k_proj): Linear(4.19 M = 0.05% Params, 17.18 GMACs = 0.07% MACs, 159.74 us = 0.06% latency, 215.1 TFLOPS, in_features=4096, out_features=1024, bias=False) |
|
(text_v_proj): Linear(4.19 M = 0.05% Params, 17.18 GMACs = 0.07% MACs, 154.5 us = 0.06% latency, 222.4 TFLOPS, in_features=4096, out_features=1024, bias=False) |
|
(o_proj): Linear(16.78 M = 0.18% Params, 68.72 GMACs = 0.28% MACs, 339.98 us = 0.14% latency, 404.25 TFLOPS, in_features=4096, out_features=4096, bias=False) |
|
) |
|
(post_attention_layernorm): GemmaRMSNorm(4.1 K = 0% Params, 0 MACs = 0% MACs, 458.24 us = 0.19% latency, 0 FLOPS) |
|
(mlp): GemmaMLP( |
|
134.38 M = 1.46% Params, 550.43 GMACs = 2.21% MACs, 2.15 ms = 0.87% latency, 513.11 TFLOPS |
|
(gate_proj): Linear(44.79 M = 0.49% Params, 183.48 GMACs = 0.74% MACs, 633.96 us = 0.26% latency, 578.83 TFLOPS, in_features=4096, out_features=10936, bias=False) |
|
(up_proj): Linear(44.79 M = 0.49% Params, 183.48 GMACs = 0.74% MACs, 620.84 us = 0.25% latency, 591.05 TFLOPS, in_features=4096, out_features=10936, bias=False) |
|
(down_proj): Linear(44.79 M = 0.49% Params, 183.48 GMACs = 0.74% MACs, 587.46 us = 0.24% latency, 624.64 TFLOPS, in_features=10936, out_features=4096, bias=False) |
|
(act_fn): PytorchGELUTanh(0 = 0% Params, 0 MACs = 0% MACs, 100.61 us = 0.04% latency, 445.21 GFLOPS) |
|
) |
|
) |
|
(3): DiTLayer( |
|
285.41 M = 3.1% Params, 775.38 GMACs = 3.11% MACs, 7.25 ms = 2.94% latency, 214.03 TFLOPS |
|
(input_layernorm): AdaLayerNormZero( |
|
100.69 M = 1.09% Params, 1.61 GMACs = 0.01% MACs, 948.19 us = 0.39% latency, 3.4 TFLOPS |
|
(silu): SiLU(0 = 0% Params, 0 MACs = 0% MACs, 38.15 us = 0.02% latency, 1.72 GFLOPS) |
|
(linear): Linear(100.69 M = 1.09% Params, 1.61 GMACs = 0.01% MACs, 239.13 us = 0.1% latency, 13.47 TFLOPS, in_features=4096, out_features=24576, bias=True) |
|
(norm): GemmaRMSNorm(4.1 K = 0% Params, 0 MACs = 0% MACs, 462.06 us = 0.19% latency, 0 FLOPS) |
|
) |
|
(self_attn): DiTSelfAttention( |
|
50.33 M = 0.55% Params, 223.34 GMACs = 0.9% MACs, 3.04 ms = 1.24% latency, 146.93 TFLOPS |
|
(q_norm): GemmaRMSNorm(128 = 0% Params, 0 MACs = 0% MACs, 464.68 us = 0.19% latency, 0 FLOPS) |
|
(k_norm): GemmaRMSNorm(128 = 0% Params, 0 MACs = 0% MACs, 238.9 us = 0.1% latency, 0 FLOPS) |
|
(q_proj): Linear(16.78 M = 0.18% Params, 68.72 GMACs = 0.28% MACs, 349.76 us = 0.14% latency, 392.95 TFLOPS, in_features=4096, out_features=4096, bias=False) |
|
(k_proj): Linear(4.19 M = 0.05% Params, 17.18 GMACs = 0.07% MACs, 208.14 us = 0.08% latency, 165.08 TFLOPS, in_features=4096, out_features=1024, bias=False) |
|
(v_proj): Linear(4.19 M = 0.05% Params, 17.18 GMACs = 0.07% MACs, 195.5 us = 0.08% latency, 175.75 TFLOPS, in_features=4096, out_features=1024, bias=False) |
|
(text_k_proj): Linear(4.19 M = 0.05% Params, 17.18 GMACs = 0.07% MACs, 159.98 us = 0.07% latency, 214.78 TFLOPS, in_features=4096, out_features=1024, bias=False) |
|
(text_v_proj): Linear(4.19 M = 0.05% Params, 17.18 GMACs = 0.07% MACs, 156.16 us = 0.06% latency, 220.02 TFLOPS, in_features=4096, out_features=1024, bias=False) |
|
(o_proj): Linear(16.78 M = 0.18% Params, 68.72 GMACs = 0.28% MACs, 346.42 us = 0.14% latency, 396.74 TFLOPS, in_features=4096, out_features=4096, bias=False) |
|
) |
|
(post_attention_layernorm): GemmaRMSNorm(4.1 K = 0% Params, 0 MACs = 0% MACs, 458.48 us = 0.19% latency, 0 FLOPS) |
|
(mlp): GemmaMLP( |
|
134.38 M = 1.46% Params, 550.43 GMACs = 2.21% MACs, 2.15 ms = 0.87% latency, 512.94 TFLOPS |
|
(gate_proj): Linear(44.79 M = 0.49% Params, 183.48 GMACs = 0.74% MACs, 627.52 us = 0.26% latency, 584.77 TFLOPS, in_features=4096, out_features=10936, bias=False) |
|
(up_proj): Linear(44.79 M = 0.49% Params, 183.48 GMACs = 0.74% MACs, 623.7 us = 0.25% latency, 588.34 TFLOPS, in_features=4096, out_features=10936, bias=False) |
|
(down_proj): Linear(44.79 M = 0.49% Params, 183.48 GMACs = 0.74% MACs, 590.09 us = 0.24% latency, 621.86 TFLOPS, in_features=10936, out_features=4096, bias=False) |
|
(act_fn): PytorchGELUTanh(0 = 0% Params, 0 MACs = 0% MACs, 101.57 us = 0.04% latency, 441.03 GFLOPS) |
|
) |
|
) |
|
(4): DiTLayer( |
|
285.41 M = 3.1% Params, 775.38 GMACs = 3.11% MACs, 7.24 ms = 2.94% latency, 214.17 TFLOPS |
|
(input_layernorm): AdaLayerNormZero( |
|
100.69 M = 1.09% Params, 1.61 GMACs = 0.01% MACs, 959.87 us = 0.39% latency, 3.36 TFLOPS |
|
(silu): SiLU(0 = 0% Params, 0 MACs = 0% MACs, 41.72 us = 0.02% latency, 1.57 GFLOPS) |
|
(linear): Linear(100.69 M = 1.09% Params, 1.61 GMACs = 0.01% MACs, 243.66 us = 0.1% latency, 13.22 TFLOPS, in_features=4096, out_features=24576, bias=True) |
|
(norm): GemmaRMSNorm(4.1 K = 0% Params, 0 MACs = 0% MACs, 463.72 us = 0.19% latency, 0 FLOPS) |
|
) |
|
(self_attn): DiTSelfAttention( |
|
50.33 M = 0.55% Params, 223.34 GMACs = 0.9% MACs, 3.03 ms = 1.23% latency, 147.57 TFLOPS |
|
(q_norm): GemmaRMSNorm(128 = 0% Params, 0 MACs = 0% MACs, 465.63 us = 0.19% latency, 0 FLOPS) |
|
(k_norm): GemmaRMSNorm(128 = 0% Params, 0 MACs = 0% MACs, 237.7 us = 0.1% latency, 0 FLOPS) |
|
(q_proj): Linear(16.78 M = 0.18% Params, 68.72 GMACs = 0.28% MACs, 349.76 us = 0.14% latency, 392.95 TFLOPS, in_features=4096, out_features=4096, bias=False) |
|
(k_proj): Linear(4.19 M = 0.05% Params, 17.18 GMACs = 0.07% MACs, 198.6 us = 0.08% latency, 173.01 TFLOPS, in_features=4096, out_features=1024, bias=False) |
|
(v_proj): Linear(4.19 M = 0.05% Params, 17.18 GMACs = 0.07% MACs, 196.22 us = 0.08% latency, 175.11 TFLOPS, in_features=4096, out_features=1024, bias=False) |
|
(text_k_proj): Linear(4.19 M = 0.05% Params, 17.18 GMACs = 0.07% MACs, 168.56 us = 0.07% latency, 203.84 TFLOPS, in_features=4096, out_features=1024, bias=False) |
|
(text_v_proj): Linear(4.19 M = 0.05% Params, 17.18 GMACs = 0.07% MACs, 156.16 us = 0.06% latency, 220.02 TFLOPS, in_features=4096, out_features=1024, bias=False) |
|
(o_proj): Linear(16.78 M = 0.18% Params, 68.72 GMACs = 0.28% MACs, 334.26 us = 0.14% latency, 411.17 TFLOPS, in_features=4096, out_features=4096, bias=False) |
|
) |
|
(post_attention_layernorm): GemmaRMSNorm(4.1 K = 0% Params, 0 MACs = 0% MACs, 459.19 us = 0.19% latency, 0 FLOPS) |
|
(mlp): GemmaMLP( |
|
134.38 M = 1.46% Params, 550.43 GMACs = 2.21% MACs, 2.14 ms = 0.87% latency, 513.91 TFLOPS |
|
(gate_proj): Linear(44.79 M = 0.49% Params, 183.48 GMACs = 0.74% MACs, 625.13 us = 0.25% latency, 587 TFLOPS, in_features=4096, out_features=10936, bias=False) |
|
(up_proj): Linear(44.79 M = 0.49% Params, 183.48 GMACs = 0.74% MACs, 623.94 us = 0.25% latency, 588.12 TFLOPS, in_features=4096, out_features=10936, bias=False) |
|
(down_proj): Linear(44.79 M = 0.49% Params, 183.48 GMACs = 0.74% MACs, 586.03 us = 0.24% latency, 626.16 TFLOPS, in_features=10936, out_features=4096, bias=False) |
|
(act_fn): PytorchGELUTanh(0 = 0% Params, 0 MACs = 0% MACs, 101.33 us = 0.04% latency, 442.07 GFLOPS) |
|
) |
|
) |
|
(5): DiTLayer( |
|
285.41 M = 3.1% Params, 775.38 GMACs = 3.11% MACs, 7.24 ms = 2.94% latency, 214.34 TFLOPS |
|
(input_layernorm): AdaLayerNormZero( |
|
100.69 M = 1.09% Params, 1.61 GMACs = 0.01% MACs, 960.59 us = 0.39% latency, 3.35 TFLOPS |
|
(silu): SiLU(0 = 0% Params, 0 MACs = 0% MACs, 39.1 us = 0.02% latency, 1.68 GFLOPS) |
|
(linear): Linear(100.69 M = 1.09% Params, 1.61 GMACs = 0.01% MACs, 246.05 us = 0.1% latency, 13.09 TFLOPS, in_features=4096, out_features=24576, bias=True) |
|
(norm): GemmaRMSNorm(4.1 K = 0% Params, 0 MACs = 0% MACs, 463.72 us = 0.19% latency, 0 FLOPS) |
|
) |
|
(self_attn): DiTSelfAttention( |
|
50.33 M = 0.55% Params, 223.34 GMACs = 0.9% MACs, 3.01 ms = 1.22% latency, 148.22 TFLOPS |
|
(q_norm): GemmaRMSNorm(128 = 0% Params, 0 MACs = 0% MACs, 463.01 us = 0.19% latency, 0 FLOPS) |
|
(k_norm): GemmaRMSNorm(128 = 0% Params, 0 MACs = 0% MACs, 236.75 us = 0.1% latency, 0 FLOPS) |
|
(q_proj): Linear(16.78 M = 0.18% Params, 68.72 GMACs = 0.28% MACs, 349.52 us = 0.14% latency, 393.22 TFLOPS, in_features=4096, out_features=4096, bias=False) |
|
(k_proj): Linear(4.19 M = 0.05% Params, 17.18 GMACs = 0.07% MACs, 197.65 us = 0.08% latency, 173.84 TFLOPS, in_features=4096, out_features=1024, bias=False) |
|
(v_proj): Linear(4.19 M = 0.05% Params, 17.18 GMACs = 0.07% MACs, 195.5 us = 0.08% latency, 175.75 TFLOPS, in_features=4096, out_features=1024, bias=False) |
|
(text_k_proj): Linear(4.19 M = 0.05% Params, 17.18 GMACs = 0.07% MACs, 158.55 us = 0.06% latency, 216.71 TFLOPS, in_features=4096, out_features=1024, bias=False) |
|
(text_v_proj): Linear(4.19 M = 0.05% Params, 17.18 GMACs = 0.07% MACs, 155.45 us = 0.06% latency, 221.04 TFLOPS, in_features=4096, out_features=1024, bias=False) |
|
(o_proj): Linear(16.78 M = 0.18% Params, 68.72 GMACs = 0.28% MACs, 333.79 us = 0.14% latency, 411.76 TFLOPS, in_features=4096, out_features=4096, bias=False) |
|
) |
|
(post_attention_layernorm): GemmaRMSNorm(4.1 K = 0% Params, 0 MACs = 0% MACs, 458 us = 0.19% latency, 0 FLOPS) |
|
(mlp): GemmaMLP( |
|
134.38 M = 1.46% Params, 550.43 GMACs = 2.21% MACs, 2.15 ms = 0.87% latency, 512.26 TFLOPS |
|
(gate_proj): Linear(44.79 M = 0.49% Params, 183.48 GMACs = 0.74% MACs, 626.8 us = 0.25% latency, 585.43 TFLOPS, in_features=4096, out_features=10936, bias=False) |
|
(up_proj): Linear(44.79 M = 0.49% Params, 183.48 GMACs = 0.74% MACs, 623.7 us = 0.25% latency, 588.34 TFLOPS, in_features=4096, out_features=10936, bias=False) |
|
(down_proj): Linear(44.79 M = 0.49% Params, 183.48 GMACs = 0.74% MACs, 586.99 us = 0.24% latency, 625.14 TFLOPS, in_features=10936, out_features=4096, bias=False) |
|
(act_fn): PytorchGELUTanh(0 = 0% Params, 0 MACs = 0% MACs, 103 us = 0.04% latency, 434.91 GFLOPS) |
|
) |
|
) |
|
(6): DiTLayer( |
|
285.41 M = 3.1% Params, 775.38 GMACs = 3.11% MACs, 7.25 ms = 2.95% latency, 213.89 TFLOPS |
|
(input_layernorm): AdaLayerNormZero( |
|
100.69 M = 1.09% Params, 1.61 GMACs = 0.01% MACs, 994.21 us = 0.4% latency, 3.24 TFLOPS |
|
(silu): SiLU(0 = 0% Params, 0 MACs = 0% MACs, 40.77 us = 0.02% latency, 1.61 GFLOPS) |
|
(linear): Linear(100.69 M = 1.09% Params, 1.61 GMACs = 0.01% MACs, 237.46 us = 0.1% latency, 13.57 TFLOPS, in_features=4096, out_features=24576, bias=True) |
|
(norm): GemmaRMSNorm(4.1 K = 0% Params, 0 MACs = 0% MACs, 463.01 us = 0.19% latency, 0 FLOPS) |
|
) |
|
(self_attn): DiTSelfAttention( |
|
50.33 M = 0.55% Params, 223.34 GMACs = 0.9% MACs, 3.01 ms = 1.22% latency, 148.35 TFLOPS |
|
(q_norm): GemmaRMSNorm(128 = 0% Params, 0 MACs = 0% MACs, 464.2 us = 0.19% latency, 0 FLOPS) |
|
(k_norm): GemmaRMSNorm(128 = 0% Params, 0 MACs = 0% MACs, 235.08 us = 0.1% latency, 0 FLOPS) |
|
(q_proj): Linear(16.78 M = 0.18% Params, 68.72 GMACs = 0.28% MACs, 347.85 us = 0.14% latency, 395.11 TFLOPS, in_features=4096, out_features=4096, bias=False) |
|
(k_proj): Linear(4.19 M = 0.05% Params, 17.18 GMACs = 0.07% MACs, 196.7 us = 0.08% latency, 174.69 TFLOPS, in_features=4096, out_features=1024, bias=False) |
|
(v_proj): Linear(4.19 M = 0.05% Params, 17.18 GMACs = 0.07% MACs, 193.6 us = 0.08% latency, 177.48 TFLOPS, in_features=4096, out_features=1024, bias=False) |
|
(text_k_proj): Linear(4.19 M = 0.05% Params, 17.18 GMACs = 0.07% MACs, 156.88 us = 0.06% latency, 219.02 TFLOPS, in_features=4096, out_features=1024, bias=False) |
|
(text_v_proj): Linear(4.19 M = 0.05% Params, 17.18 GMACs = 0.07% MACs, 154.26 us = 0.06% latency, 222.74 TFLOPS, in_features=4096, out_features=1024, bias=False) |
|
(o_proj): Linear(16.78 M = 0.18% Params, 68.72 GMACs = 0.28% MACs, 340.22 us = 0.14% latency, 403.97 TFLOPS, in_features=4096, out_features=4096, bias=False) |
|
) |
|
(post_attention_layernorm): GemmaRMSNorm(4.1 K = 0% Params, 0 MACs = 0% MACs, 458.72 us = 0.19% latency, 0 FLOPS) |
|
(mlp): GemmaMLP( |
|
134.38 M = 1.46% Params, 550.43 GMACs = 2.21% MACs, 2.14 ms = 0.87% latency, 515.29 TFLOPS |
|
(gate_proj): Linear(44.79 M = 0.49% Params, 183.48 GMACs = 0.74% MACs, 626.33 us = 0.25% latency, 585.88 TFLOPS, in_features=4096, out_features=10936, bias=False) |
|
(up_proj): Linear(44.79 M = 0.49% Params, 183.48 GMACs = 0.74% MACs, 622.27 us = 0.25% latency, 589.7 TFLOPS, in_features=4096, out_features=10936, bias=False) |
|
(down_proj): Linear(44.79 M = 0.49% Params, 183.48 GMACs = 0.74% MACs, 583.17 us = 0.24% latency, 629.23 TFLOPS, in_features=10936, out_features=4096, bias=False) |
|
(act_fn): PytorchGELUTanh(0 = 0% Params, 0 MACs = 0% MACs, 100.37 us = 0.04% latency, 446.27 GFLOPS) |
|
) |
|
) |
|
(7): DiTLayer( |
|
285.41 M = 3.1% Params, 775.38 GMACs = 3.11% MACs, 7.2 ms = 2.93% latency, 215.45 TFLOPS |
|
(input_layernorm): AdaLayerNormZero( |
|
100.69 M = 1.09% Params, 1.61 GMACs = 0.01% MACs, 946.28 us = 0.38% latency, 3.4 TFLOPS |
|
(silu): SiLU(0 = 0% Params, 0 MACs = 0% MACs, 36 us = 0.01% latency, 1.82 GFLOPS) |
|
(linear): Linear(100.69 M = 1.09% Params, 1.61 GMACs = 0.01% MACs, 242.71 us = 0.1% latency, 13.27 TFLOPS, in_features=4096, out_features=24576, bias=True) |
|
(norm): GemmaRMSNorm(4.1 K = 0% Params, 0 MACs = 0% MACs, 461.1 us = 0.19% latency, 0 FLOPS) |
|
) |
|
(self_attn): DiTSelfAttention( |
|
50.33 M = 0.55% Params, 223.34 GMACs = 0.9% MACs, 3.01 ms = 1.22% latency, 148.48 TFLOPS |
|
(q_norm): GemmaRMSNorm(128 = 0% Params, 0 MACs = 0% MACs, 464.68 us = 0.19% latency, 0 FLOPS) |
|
(k_norm): GemmaRMSNorm(128 = 0% Params, 0 MACs = 0% MACs, 235.08 us = 0.1% latency, 0 FLOPS) |
|
(q_proj): Linear(16.78 M = 0.18% Params, 68.72 GMACs = 0.28% MACs, 350 us = 0.14% latency, 392.68 TFLOPS, in_features=4096, out_features=4096, bias=False) |
|
(k_proj): Linear(4.19 M = 0.05% Params, 17.18 GMACs = 0.07% MACs, 198.84 us = 0.08% latency, 172.8 TFLOPS, in_features=4096, out_features=1024, bias=False) |
|
(v_proj): Linear(4.19 M = 0.05% Params, 17.18 GMACs = 0.07% MACs, 194.79 us = 0.08% latency, 176.4 TFLOPS, in_features=4096, out_features=1024, bias=False) |
|
(text_k_proj): Linear(4.19 M = 0.05% Params, 17.18 GMACs = 0.07% MACs, 155.45 us = 0.06% latency, 221.04 TFLOPS, in_features=4096, out_features=1024, bias=False) |
|
(text_v_proj): Linear(4.19 M = 0.05% Params, 17.18 GMACs = 0.07% MACs, 153.54 us = 0.06% latency, 223.78 TFLOPS, in_features=4096, out_features=1024, bias=False) |
|
(o_proj): Linear(16.78 M = 0.18% Params, 68.72 GMACs = 0.28% MACs, 335.45 us = 0.14% latency, 409.71 TFLOPS, in_features=4096, out_features=4096, bias=False) |
|
) |
|
(post_attention_layernorm): GemmaRMSNorm(4.1 K = 0% Params, 0 MACs = 0% MACs, 456.57 us = 0.19% latency, 0 FLOPS) |
|
(mlp): GemmaMLP( |
|
134.38 M = 1.46% Params, 550.43 GMACs = 2.21% MACs, 2.14 ms = 0.87% latency, 515.12 TFLOPS |
|
(gate_proj): Linear(44.79 M = 0.49% Params, 183.48 GMACs = 0.74% MACs, 627.28 us = 0.25% latency, 584.99 TFLOPS, in_features=4096, out_features=10936, bias=False) |
|
(up_proj): Linear(44.79 M = 0.49% Params, 183.48 GMACs = 0.74% MACs, 622.27 us = 0.25% latency, 589.7 TFLOPS, in_features=4096, out_features=10936, bias=False) |
|
(down_proj): Linear(44.79 M = 0.49% Params, 183.48 GMACs = 0.74% MACs, 583.17 us = 0.24% latency, 629.23 TFLOPS, in_features=10936, out_features=4096, bias=False) |
|
(act_fn): PytorchGELUTanh(0 = 0% Params, 0 MACs = 0% MACs, 100.14 us = 0.04% latency, 447.33 GFLOPS) |
|
) |
|
) |
|
(8): DiTLayer( |
|
285.41 M = 3.1% Params, 775.38 GMACs = 3.11% MACs, 7.27 ms = 2.95% latency, 213.4 TFLOPS |
|
(input_layernorm): AdaLayerNormZero( |
|
100.69 M = 1.09% Params, 1.61 GMACs = 0.01% MACs, 960.35 us = 0.39% latency, 3.35 TFLOPS |
|
(silu): SiLU(0 = 0% Params, 0 MACs = 0% MACs, 43.39 us = 0.02% latency, 1.51 GFLOPS) |
|
(linear): Linear(100.69 M = 1.09% Params, 1.61 GMACs = 0.01% MACs, 238.18 us = 0.1% latency, 13.52 TFLOPS, in_features=4096, out_features=24576, bias=True) |
|
(norm): GemmaRMSNorm(4.1 K = 0% Params, 0 MACs = 0% MACs, 463.72 us = 0.19% latency, 0 FLOPS) |
|
) |
|
(self_attn): DiTSelfAttention( |
|
50.33 M = 0.55% Params, 223.34 GMACs = 0.9% MACs, 3.01 ms = 1.23% latency, 148.17 TFLOPS |
|
(q_norm): GemmaRMSNorm(128 = 0% Params, 0 MACs = 0% MACs, 462.53 us = 0.19% latency, 0 FLOPS) |
|
(k_norm): GemmaRMSNorm(128 = 0% Params, 0 MACs = 0% MACs, 236.27 us = 0.1% latency, 0 FLOPS) |
|
(q_proj): Linear(16.78 M = 0.18% Params, 68.72 GMACs = 0.28% MACs, 348.33 us = 0.14% latency, 394.57 TFLOPS, in_features=4096, out_features=4096, bias=False) |
|
(k_proj): Linear(4.19 M = 0.05% Params, 17.18 GMACs = 0.07% MACs, 198.36 us = 0.08% latency, 173.22 TFLOPS, in_features=4096, out_features=1024, bias=False) |
|
(v_proj): Linear(4.19 M = 0.05% Params, 17.18 GMACs = 0.07% MACs, 194.55 us = 0.08% latency, 176.61 TFLOPS, in_features=4096, out_features=1024, bias=False) |
|
(text_k_proj): Linear(4.19 M = 0.05% Params, 17.18 GMACs = 0.07% MACs, 156.4 us = 0.06% latency, 219.69 TFLOPS, in_features=4096, out_features=1024, bias=False) |
|
(text_v_proj): Linear(4.19 M = 0.05% Params, 17.18 GMACs = 0.07% MACs, 154.97 us = 0.06% latency, 221.72 TFLOPS, in_features=4096, out_features=1024, bias=False) |
|
(o_proj): Linear(16.78 M = 0.18% Params, 68.72 GMACs = 0.28% MACs, 341.89 us = 0.14% latency, 401.99 TFLOPS, in_features=4096, out_features=4096, bias=False) |
|
) |
|
(post_attention_layernorm): GemmaRMSNorm(4.1 K = 0% Params, 0 MACs = 0% MACs, 458 us = 0.19% latency, 0 FLOPS) |
|
(mlp): GemmaMLP( |
|
134.38 M = 1.46% Params, 550.43 GMACs = 2.21% MACs, 2.18 ms = 0.89% latency, 504.81 TFLOPS |
|
(gate_proj): Linear(44.79 M = 0.49% Params, 183.48 GMACs = 0.74% MACs, 627.52 us = 0.26% latency, 584.77 TFLOPS, in_features=4096, out_features=10936, bias=False) |
|
(up_proj): Linear(44.79 M = 0.49% Params, 183.48 GMACs = 0.74% MACs, 624.18 us = 0.25% latency, 587.89 TFLOPS, in_features=4096, out_features=10936, bias=False) |
|
(down_proj): Linear(44.79 M = 0.49% Params, 183.48 GMACs = 0.74% MACs, 587.7 us = 0.24% latency, 624.38 TFLOPS, in_features=10936, out_features=4096, bias=False) |
|
(act_fn): PytorchGELUTanh(0 = 0% Params, 0 MACs = 0% MACs, 103.24 us = 0.04% latency, 433.9 GFLOPS) |
|
) |
|
) |
|
(9): DiTLayer( |
|
285.41 M = 3.1% Params, 775.38 GMACs = 3.11% MACs, 7.27 ms = 2.96% latency, 213.18 TFLOPS |
|
(input_layernorm): AdaLayerNormZero( |
|
100.69 M = 1.09% Params, 1.61 GMACs = 0.01% MACs, 957.73 us = 0.39% latency, 3.36 TFLOPS |
|
(silu): SiLU(0 = 0% Params, 0 MACs = 0% MACs, 38.86 us = 0.02% latency, 1.69 GFLOPS) |
|
(linear): Linear(100.69 M = 1.09% Params, 1.61 GMACs = 0.01% MACs, 240.33 us = 0.1% latency, 13.4 TFLOPS, in_features=4096, out_features=24576, bias=True) |
|
(norm): GemmaRMSNorm(4.1 K = 0% Params, 0 MACs = 0% MACs, 463.72 us = 0.19% latency, 0 FLOPS) |
|
) |
|
(self_attn): DiTSelfAttention( |
|
50.33 M = 0.55% Params, 223.34 GMACs = 0.9% MACs, 3.03 ms = 1.23% latency, 147.24 TFLOPS |
|
(q_norm): GemmaRMSNorm(128 = 0% Params, 0 MACs = 0% MACs, 464.92 us = 0.19% latency, 0 FLOPS) |
|
(k_norm): GemmaRMSNorm(128 = 0% Params, 0 MACs = 0% MACs, 236.03 us = 0.1% latency, 0 FLOPS) |
|
(q_proj): Linear(16.78 M = 0.18% Params, 68.72 GMACs = 0.28% MACs, 350 us = 0.14% latency, 392.68 TFLOPS, in_features=4096, out_features=4096, bias=False) |
|
(k_proj): Linear(4.19 M = 0.05% Params, 17.18 GMACs = 0.07% MACs, 199.08 us = 0.08% latency, 172.59 TFLOPS, in_features=4096, out_features=1024, bias=False) |
|
(v_proj): Linear(4.19 M = 0.05% Params, 17.18 GMACs = 0.07% MACs, 194.79 us = 0.08% latency, 176.4 TFLOPS, in_features=4096, out_features=1024, bias=False) |
|
(text_k_proj): Linear(4.19 M = 0.05% Params, 17.18 GMACs = 0.07% MACs, 157.59 us = 0.06% latency, 218.03 TFLOPS, in_features=4096, out_features=1024, bias=False) |
|
(text_v_proj): Linear(4.19 M = 0.05% Params, 17.18 GMACs = 0.07% MACs, 153.78 us = 0.06% latency, 223.43 TFLOPS, in_features=4096, out_features=1024, bias=False) |
|
(o_proj): Linear(16.78 M = 0.18% Params, 68.72 GMACs = 0.28% MACs, 347.85 us = 0.14% latency, 395.11 TFLOPS, in_features=4096, out_features=4096, bias=False) |
|
) |
|
(post_attention_layernorm): GemmaRMSNorm(4.1 K = 0% Params, 0 MACs = 0% MACs, 459.19 us = 0.19% latency, 0 FLOPS) |
|
(mlp): GemmaMLP( |
|
134.38 M = 1.46% Params, 550.43 GMACs = 2.21% MACs, 2.16 ms = 0.88% latency, 510.73 TFLOPS |
|
(gate_proj): Linear(44.79 M = 0.49% Params, 183.48 GMACs = 0.74% MACs, 635.15 us = 0.26% latency, 577.74 TFLOPS, in_features=4096, out_features=10936, bias=False) |
|
(up_proj): Linear(44.79 M = 0.49% Params, 183.48 GMACs = 0.74% MACs, 623.46 us = 0.25% latency, 588.57 TFLOPS, in_features=4096, out_features=10936, bias=False) |
|
(down_proj): Linear(44.79 M = 0.49% Params, 183.48 GMACs = 0.74% MACs, 587.46 us = 0.24% latency, 624.64 TFLOPS, in_features=10936, out_features=4096, bias=False) |
|
(act_fn): PytorchGELUTanh(0 = 0% Params, 0 MACs = 0% MACs, 102.76 us = 0.04% latency, 435.91 GFLOPS) |
|
) |
|
) |
|
(10): DiTLayer( |
|
285.41 M = 3.1% Params, 775.38 GMACs = 3.11% MACs, 7.27 ms = 2.96% latency, 213.26 TFLOPS |
|
(input_layernorm): AdaLayerNormZero( |
|
100.69 M = 1.09% Params, 1.61 GMACs = 0.01% MACs, 949.38 us = 0.39% latency, 3.39 TFLOPS |
|
(silu): SiLU(0 = 0% Params, 0 MACs = 0% MACs, 36.72 us = 0.01% latency, 1.78 GFLOPS) |
|
(linear): Linear(100.69 M = 1.09% Params, 1.61 GMACs = 0.01% MACs, 241.28 us = 0.1% latency, 13.35 TFLOPS, in_features=4096, out_features=24576, bias=True) |
|
(norm): GemmaRMSNorm(4.1 K = 0% Params, 0 MACs = 0% MACs, 461.58 us = 0.19% latency, 0 FLOPS) |
|
) |
|
(self_attn): DiTSelfAttention( |
|
50.33 M = 0.55% Params, 223.34 GMACs = 0.9% MACs, 3.03 ms = 1.23% latency, 147.18 TFLOPS |
|
(q_norm): GemmaRMSNorm(128 = 0% Params, 0 MACs = 0% MACs, 466.11 us = 0.19% latency, 0 FLOPS) |
|
(k_norm): GemmaRMSNorm(128 = 0% Params, 0 MACs = 0% MACs, 237.46 us = 0.1% latency, 0 FLOPS) |
|
(q_proj): Linear(16.78 M = 0.18% Params, 68.72 GMACs = 0.28% MACs, 349.28 us = 0.14% latency, 393.49 TFLOPS, in_features=4096, out_features=4096, bias=False) |
|
(k_proj): Linear(4.19 M = 0.05% Params, 17.18 GMACs = 0.07% MACs, 198.13 us = 0.08% latency, 173.42 TFLOPS, in_features=4096, out_features=1024, bias=False) |
|
(v_proj): Linear(4.19 M = 0.05% Params, 17.18 GMACs = 0.07% MACs, 194.79 us = 0.08% latency, 176.4 TFLOPS, in_features=4096, out_features=1024, bias=False) |
|
(text_k_proj): Linear(4.19 M = 0.05% Params, 17.18 GMACs = 0.07% MACs, 158.31 us = 0.06% latency, 217.04 TFLOPS, in_features=4096, out_features=1024, bias=False) |
|
(text_v_proj): Linear(4.19 M = 0.05% Params, 17.18 GMACs = 0.07% MACs, 154.97 us = 0.06% latency, 221.72 TFLOPS, in_features=4096, out_features=1024, bias=False) |
|
(o_proj): Linear(16.78 M = 0.18% Params, 68.72 GMACs = 0.28% MACs, 347.14 us = 0.14% latency, 395.92 TFLOPS, in_features=4096, out_features=4096, bias=False) |
|
) |
|
(post_attention_layernorm): GemmaRMSNorm(4.1 K = 0% Params, 0 MACs = 0% MACs, 462.06 us = 0.19% latency, 0 FLOPS) |
|
(mlp): GemmaMLP( |
|
134.38 M = 1.46% Params, 550.43 GMACs = 2.21% MACs, 2.17 ms = 0.88% latency, 508.48 TFLOPS |
|
(gate_proj): Linear(44.79 M = 0.49% Params, 183.48 GMACs = 0.74% MACs, 629.19 us = 0.26% latency, 583.22 TFLOPS, in_features=4096, out_features=10936, bias=False) |
|
(up_proj): Linear(44.79 M = 0.49% Params, 183.48 GMACs = 0.74% MACs, 623.94 us = 0.25% latency, 588.12 TFLOPS, in_features=4096, out_features=10936, bias=False) |
|
(down_proj): Linear(44.79 M = 0.49% Params, 183.48 GMACs = 0.74% MACs, 594.38 us = 0.24% latency, 617.37 TFLOPS, in_features=10936, out_features=4096, bias=False) |
|
(act_fn): PytorchGELUTanh(0 = 0% Params, 0 MACs = 0% MACs, 102.52 us = 0.04% latency, 436.93 GFLOPS) |
|
) |
|
) |
|
(11): DiTLayer( |
|
285.41 M = 3.1% Params, 775.38 GMACs = 3.11% MACs, 7.31 ms = 2.97% latency, 212.09 TFLOPS |
|
(input_layernorm): AdaLayerNormZero( |
|
100.69 M = 1.09% Params, 1.61 GMACs = 0.01% MACs, 954.63 us = 0.39% latency, 3.37 TFLOPS |
|
(silu): SiLU(0 = 0% Params, 0 MACs = 0% MACs, 38.15 us = 0.02% latency, 1.72 GFLOPS) |
|
(linear): Linear(100.69 M = 1.09% Params, 1.61 GMACs = 0.01% MACs, 239.85 us = 0.1% latency, 13.43 TFLOPS, in_features=4096, out_features=24576, bias=True) |
|
(norm): GemmaRMSNorm(4.1 K = 0% Params, 0 MACs = 0% MACs, 462.06 us = 0.19% latency, 0 FLOPS) |
|
) |
|
(self_attn): DiTSelfAttention( |
|
50.33 M = 0.55% Params, 223.34 GMACs = 0.9% MACs, 3.09 ms = 1.26% latency, 144.35 TFLOPS |
|
(q_norm): GemmaRMSNorm(128 = 0% Params, 0 MACs = 0% MACs, 467.54 us = 0.19% latency, 0 FLOPS) |
|
(k_norm): GemmaRMSNorm(128 = 0% Params, 0 MACs = 0% MACs, 236.75 us = 0.1% latency, 0 FLOPS) |
|
(q_proj): Linear(16.78 M = 0.18% Params, 68.72 GMACs = 0.28% MACs, 354.05 us = 0.14% latency, 388.19 TFLOPS, in_features=4096, out_features=4096, bias=False) |
|
(k_proj): Linear(4.19 M = 0.05% Params, 17.18 GMACs = 0.07% MACs, 202.89 us = 0.08% latency, 169.35 TFLOPS, in_features=4096, out_features=1024, bias=False) |
|
(v_proj): Linear(4.19 M = 0.05% Params, 17.18 GMACs = 0.07% MACs, 197.89 us = 0.08% latency, 173.63 TFLOPS, in_features=4096, out_features=1024, bias=False) |
|
(text_k_proj): Linear(4.19 M = 0.05% Params, 17.18 GMACs = 0.07% MACs, 160.46 us = 0.07% latency, 214.14 TFLOPS, in_features=4096, out_features=1024, bias=False) |
|
(text_v_proj): Linear(4.19 M = 0.05% Params, 17.18 GMACs = 0.07% MACs, 160.46 us = 0.07% latency, 214.14 TFLOPS, in_features=4096, out_features=1024, bias=False) |
|
(o_proj): Linear(16.78 M = 0.18% Params, 68.72 GMACs = 0.28% MACs, 340.7 us = 0.14% latency, 403.4 TFLOPS, in_features=4096, out_features=4096, bias=False) |
|
) |
|
(post_attention_layernorm): GemmaRMSNorm(4.1 K = 0% Params, 0 MACs = 0% MACs, 458.24 us = 0.19% latency, 0 FLOPS) |
|
(mlp): GemmaMLP( |
|
134.38 M = 1.46% Params, 550.43 GMACs = 2.21% MACs, 2.15 ms = 0.87% latency, 512.88 TFLOPS |
|
(gate_proj): Linear(44.79 M = 0.49% Params, 183.48 GMACs = 0.74% MACs, 626.09 us = 0.25% latency, 586.1 TFLOPS, in_features=4096, out_features=10936, bias=False) |
|
(up_proj): Linear(44.79 M = 0.49% Params, 183.48 GMACs = 0.74% MACs, 623.23 us = 0.25% latency, 588.79 TFLOPS, in_features=4096, out_features=10936, bias=False) |
|
(down_proj): Linear(44.79 M = 0.49% Params, 183.48 GMACs = 0.74% MACs, 588.66 us = 0.24% latency, 623.37 TFLOPS, in_features=10936, out_features=4096, bias=False) |
|
(act_fn): PytorchGELUTanh(0 = 0% Params, 0 MACs = 0% MACs, 101.33 us = 0.04% latency, 442.07 GFLOPS) |
|
) |
|
) |
|
(12): DiTLayer( |
|
285.41 M = 3.1% Params, 775.38 GMACs = 3.11% MACs, 7.21 ms = 2.93% latency, 215.14 TFLOPS |
|
(input_layernorm): AdaLayerNormZero( |
|
100.69 M = 1.09% Params, 1.61 GMACs = 0.01% MACs, 944.38 us = 0.38% latency, 3.41 TFLOPS |
|
(silu): SiLU(0 = 0% Params, 0 MACs = 0% MACs, 36.95 us = 0.02% latency, 1.77 GFLOPS) |
|
(linear): Linear(100.69 M = 1.09% Params, 1.61 GMACs = 0.01% MACs, 235.8 us = 0.1% latency, 13.66 TFLOPS, in_features=4096, out_features=24576, bias=True) |
|
(norm): GemmaRMSNorm(4.1 K = 0% Params, 0 MACs = 0% MACs, 460.86 us = 0.19% latency, 0 FLOPS) |
|
) |
|
(self_attn): DiTSelfAttention( |
|
50.33 M = 0.55% Params, 223.34 GMACs = 0.9% MACs, 3.01 ms = 1.22% latency, 148.27 TFLOPS |
|
(q_norm): GemmaRMSNorm(128 = 0% Params, 0 MACs = 0% MACs, 464.44 us = 0.19% latency, 0 FLOPS) |
|
(k_norm): GemmaRMSNorm(128 = 0% Params, 0 MACs = 0% MACs, 238.42 us = 0.1% latency, 0 FLOPS) |
|
(q_proj): Linear(16.78 M = 0.18% Params, 68.72 GMACs = 0.28% MACs, 347.38 us = 0.14% latency, 395.65 TFLOPS, in_features=4096, out_features=4096, bias=False) |
|
(k_proj): Linear(4.19 M = 0.05% Params, 17.18 GMACs = 0.07% MACs, 198.13 us = 0.08% latency, 173.42 TFLOPS, in_features=4096, out_features=1024, bias=False) |
|
(v_proj): Linear(4.19 M = 0.05% Params, 17.18 GMACs = 0.07% MACs, 195.98 us = 0.08% latency, 175.32 TFLOPS, in_features=4096, out_features=1024, bias=False) |
|
(text_k_proj): Linear(4.19 M = 0.05% Params, 17.18 GMACs = 0.07% MACs, 158.07 us = 0.06% latency, 217.37 TFLOPS, in_features=4096, out_features=1024, bias=False) |
|
(text_v_proj): Linear(4.19 M = 0.05% Params, 17.18 GMACs = 0.07% MACs, 155.69 us = 0.06% latency, 220.7 TFLOPS, in_features=4096, out_features=1024, bias=False) |
|
(o_proj): Linear(16.78 M = 0.18% Params, 68.72 GMACs = 0.28% MACs, 337.84 us = 0.14% latency, 406.82 TFLOPS, in_features=4096, out_features=4096, bias=False) |
|
) |
|
(post_attention_layernorm): GemmaRMSNorm(4.1 K = 0% Params, 0 MACs = 0% MACs, 458.48 us = 0.19% latency, 0 FLOPS) |
|
(mlp): GemmaMLP( |
|
134.38 M = 1.46% Params, 550.43 GMACs = 2.21% MACs, 2.14 ms = 0.87% latency, 514.26 TFLOPS |
|
(gate_proj): Linear(44.79 M = 0.49% Params, 183.48 GMACs = 0.74% MACs, 624.9 us = 0.25% latency, 587.22 TFLOPS, in_features=4096, out_features=10936, bias=False) |
|
(up_proj): Linear(44.79 M = 0.49% Params, 183.48 GMACs = 0.74% MACs, 622.75 us = 0.25% latency, 589.24 TFLOPS, in_features=4096, out_features=10936, bias=False) |
|
(down_proj): Linear(44.79 M = 0.49% Params, 183.48 GMACs = 0.74% MACs, 587.22 us = 0.24% latency, 624.89 TFLOPS, in_features=10936, out_features=4096, bias=False) |
|
(act_fn): PytorchGELUTanh(0 = 0% Params, 0 MACs = 0% MACs, 101.8 us = 0.04% latency, 440 GFLOPS) |
|
) |
|
) |
|
(13): DiTLayer( |
|
285.41 M = 3.1% Params, 775.38 GMACs = 3.11% MACs, 7.24 ms = 2.94% latency, 214.3 TFLOPS |
|
(input_layernorm): AdaLayerNormZero( |
|
100.69 M = 1.09% Params, 1.61 GMACs = 0.01% MACs, 960.11 us = 0.39% latency, 3.36 TFLOPS |
|
(silu): SiLU(0 = 0% Params, 0 MACs = 0% MACs, 39.34 us = 0.02% latency, 1.67 GFLOPS) |
|
(linear): Linear(100.69 M = 1.09% Params, 1.61 GMACs = 0.01% MACs, 246.05 us = 0.1% latency, 13.09 TFLOPS, in_features=4096, out_features=24576, bias=True) |
|
(norm): GemmaRMSNorm(4.1 K = 0% Params, 0 MACs = 0% MACs, 464.2 us = 0.19% latency, 0 FLOPS) |
|
) |
|
(self_attn): DiTSelfAttention( |
|
50.33 M = 0.55% Params, 223.34 GMACs = 0.9% MACs, 3.03 ms = 1.23% latency, 147.59 TFLOPS |
|
(q_norm): GemmaRMSNorm(128 = 0% Params, 0 MACs = 0% MACs, 464.44 us = 0.19% latency, 0 FLOPS) |
|
(k_norm): GemmaRMSNorm(128 = 0% Params, 0 MACs = 0% MACs, 236.27 us = 0.1% latency, 0 FLOPS) |
|
(q_proj): Linear(16.78 M = 0.18% Params, 68.72 GMACs = 0.28% MACs, 348.09 us = 0.14% latency, 394.84 TFLOPS, in_features=4096, out_features=4096, bias=False) |
|
(k_proj): Linear(4.19 M = 0.05% Params, 17.18 GMACs = 0.07% MACs, 199.32 us = 0.08% latency, 172.39 TFLOPS, in_features=4096, out_features=1024, bias=False) |
|
(v_proj): Linear(4.19 M = 0.05% Params, 17.18 GMACs = 0.07% MACs, 195.26 us = 0.08% latency, 175.96 TFLOPS, in_features=4096, out_features=1024, bias=False) |
|
(text_k_proj): Linear(4.19 M = 0.05% Params, 17.18 GMACs = 0.07% MACs, 168.56 us = 0.07% latency, 203.84 TFLOPS, in_features=4096, out_features=1024, bias=False) |
|
(text_v_proj): Linear(4.19 M = 0.05% Params, 17.18 GMACs = 0.07% MACs, 155.69 us = 0.06% latency, 220.7 TFLOPS, in_features=4096, out_features=1024, bias=False) |
|
(o_proj): Linear(16.78 M = 0.18% Params, 68.72 GMACs = 0.28% MACs, 338.08 us = 0.14% latency, 406.53 TFLOPS, in_features=4096, out_features=4096, bias=False) |
|
) |
|
(post_attention_layernorm): GemmaRMSNorm(4.1 K = 0% Params, 0 MACs = 0% MACs, 459.19 us = 0.19% latency, 0 FLOPS) |
|
(mlp): GemmaMLP( |
|
134.38 M = 1.46% Params, 550.43 GMACs = 2.21% MACs, 2.14 ms = 0.87% latency, 514.6 TFLOPS |
|
(gate_proj): Linear(44.79 M = 0.49% Params, 183.48 GMACs = 0.74% MACs, 626.33 us = 0.25% latency, 585.88 TFLOPS, in_features=4096, out_features=10936, bias=False) |
|
(up_proj): Linear(44.79 M = 0.49% Params, 183.48 GMACs = 0.74% MACs, 621.8 us = 0.25% latency, 590.15 TFLOPS, in_features=4096, out_features=10936, bias=False) |
|
(down_proj): Linear(44.79 M = 0.49% Params, 183.48 GMACs = 0.74% MACs, 587.94 us = 0.24% latency, 624.13 TFLOPS, in_features=10936, out_features=4096, bias=False) |
|
(act_fn): PytorchGELUTanh(0 = 0% Params, 0 MACs = 0% MACs, 101.57 us = 0.04% latency, 441.03 GFLOPS) |
|
) |
|
) |
|
(14): DiTLayer( |
|
285.41 M = 3.1% Params, 775.38 GMACs = 3.11% MACs, 7.21 ms = 2.93% latency, 214.95 TFLOPS |
|
(input_layernorm): AdaLayerNormZero( |
|
100.69 M = 1.09% Params, 1.61 GMACs = 0.01% MACs, 953.2 us = 0.39% latency, 3.38 TFLOPS |
|
(silu): SiLU(0 = 0% Params, 0 MACs = 0% MACs, 39.58 us = 0.02% latency, 1.66 GFLOPS) |
|
(linear): Linear(100.69 M = 1.09% Params, 1.61 GMACs = 0.01% MACs, 242.23 us = 0.1% latency, 13.3 TFLOPS, in_features=4096, out_features=24576, bias=True) |
|
(norm): GemmaRMSNorm(4.1 K = 0% Params, 0 MACs = 0% MACs, 461.34 us = 0.19% latency, 0 FLOPS) |
|
) |
|
(self_attn): DiTSelfAttention( |
|
50.33 M = 0.55% Params, 223.34 GMACs = 0.9% MACs, 3.01 ms = 1.22% latency, 148.21 TFLOPS |
|
(q_norm): GemmaRMSNorm(128 = 0% Params, 0 MACs = 0% MACs, 463.96 us = 0.19% latency, 0 FLOPS) |
|
(k_norm): GemmaRMSNorm(128 = 0% Params, 0 MACs = 0% MACs, 236.03 us = 0.1% latency, 0 FLOPS) |
|
(q_proj): Linear(16.78 M = 0.18% Params, 68.72 GMACs = 0.28% MACs, 348.81 us = 0.14% latency, 394.03 TFLOPS, in_features=4096, out_features=4096, bias=False) |
|
(k_proj): Linear(4.19 M = 0.05% Params, 17.18 GMACs = 0.07% MACs, 197.65 us = 0.08% latency, 173.84 TFLOPS, in_features=4096, out_features=1024, bias=False) |
|
(v_proj): Linear(4.19 M = 0.05% Params, 17.18 GMACs = 0.07% MACs, 193.83 us = 0.08% latency, 177.26 TFLOPS, in_features=4096, out_features=1024, bias=False) |
|
(text_k_proj): Linear(4.19 M = 0.05% Params, 17.18 GMACs = 0.07% MACs, 156.4 us = 0.06% latency, 219.69 TFLOPS, in_features=4096, out_features=1024, bias=False) |
|
(text_v_proj): Linear(4.19 M = 0.05% Params, 17.18 GMACs = 0.07% MACs, 154.26 us = 0.06% latency, 222.74 TFLOPS, in_features=4096, out_features=1024, bias=False) |
|
(o_proj): Linear(16.78 M = 0.18% Params, 68.72 GMACs = 0.28% MACs, 339.98 us = 0.14% latency, 404.25 TFLOPS, in_features=4096, out_features=4096, bias=False) |
|
) |
|
(post_attention_layernorm): GemmaRMSNorm(4.1 K = 0% Params, 0 MACs = 0% MACs, 458.48 us = 0.19% latency, 0 FLOPS) |
|
(mlp): GemmaMLP( |
|
134.38 M = 1.46% Params, 550.43 GMACs = 2.21% MACs, 2.14 ms = 0.87% latency, 515.29 TFLOPS |
|
(gate_proj): Linear(44.79 M = 0.49% Params, 183.48 GMACs = 0.74% MACs, 624.66 us = 0.25% latency, 587.44 TFLOPS, in_features=4096, out_features=10936, bias=False) |
|
(up_proj): Linear(44.79 M = 0.49% Params, 183.48 GMACs = 0.74% MACs, 621.56 us = 0.25% latency, 590.37 TFLOPS, in_features=4096, out_features=10936, bias=False) |
|
(down_proj): Linear(44.79 M = 0.49% Params, 183.48 GMACs = 0.74% MACs, 587.7 us = 0.24% latency, 624.38 TFLOPS, in_features=10936, out_features=4096, bias=False) |
|
(act_fn): PytorchGELUTanh(0 = 0% Params, 0 MACs = 0% MACs, 100.61 us = 0.04% latency, 445.21 GFLOPS) |
|
) |
|
) |
|
(15): DiTLayer( |
|
285.41 M = 3.1% Params, 775.38 GMACs = 3.11% MACs, 7.23 ms = 2.94% latency, 214.39 TFLOPS |
|
(input_layernorm): AdaLayerNormZero( |
|
100.69 M = 1.09% Params, 1.61 GMACs = 0.01% MACs, 954.15 us = 0.39% latency, 3.38 TFLOPS |
|
(silu): SiLU(0 = 0% Params, 0 MACs = 0% MACs, 40.05 us = 0.02% latency, 1.64 GFLOPS) |
|
(linear): Linear(100.69 M = 1.09% Params, 1.61 GMACs = 0.01% MACs, 240.33 us = 0.1% latency, 13.4 TFLOPS, in_features=4096, out_features=24576, bias=True) |
|
(norm): GemmaRMSNorm(4.1 K = 0% Params, 0 MACs = 0% MACs, 464.92 us = 0.19% latency, 0 FLOPS) |
|
) |
|
(self_attn): DiTSelfAttention( |
|
50.33 M = 0.55% Params, 223.34 GMACs = 0.9% MACs, 3.02 ms = 1.23% latency, 147.86 TFLOPS |
|
(q_norm): GemmaRMSNorm(128 = 0% Params, 0 MACs = 0% MACs, 463.96 us = 0.19% latency, 0 FLOPS) |
|
(k_norm): GemmaRMSNorm(128 = 0% Params, 0 MACs = 0% MACs, 236.03 us = 0.1% latency, 0 FLOPS) |
|
(q_proj): Linear(16.78 M = 0.18% Params, 68.72 GMACs = 0.28% MACs, 350.48 us = 0.14% latency, 392.15 TFLOPS, in_features=4096, out_features=4096, bias=False) |
|
(k_proj): Linear(4.19 M = 0.05% Params, 17.18 GMACs = 0.07% MACs, 198.84 us = 0.08% latency, 172.8 TFLOPS, in_features=4096, out_features=1024, bias=False) |
|
(v_proj): Linear(4.19 M = 0.05% Params, 17.18 GMACs = 0.07% MACs, 195.5 us = 0.08% latency, 175.75 TFLOPS, in_features=4096, out_features=1024, bias=False) |
|
(text_k_proj): Linear(4.19 M = 0.05% Params, 17.18 GMACs = 0.07% MACs, 158.79 us = 0.06% latency, 216.39 TFLOPS, in_features=4096, out_features=1024, bias=False) |
|
(text_v_proj): Linear(4.19 M = 0.05% Params, 17.18 GMACs = 0.07% MACs, 154.26 us = 0.06% latency, 222.74 TFLOPS, in_features=4096, out_features=1024, bias=False) |
|
(o_proj): Linear(16.78 M = 0.18% Params, 68.72 GMACs = 0.28% MACs, 340.7 us = 0.14% latency, 403.4 TFLOPS, in_features=4096, out_features=4096, bias=False) |
|
) |
|
(post_attention_layernorm): GemmaRMSNorm(4.1 K = 0% Params, 0 MACs = 0% MACs, 458.24 us = 0.19% latency, 0 FLOPS) |
|
(mlp): GemmaMLP( |
|
134.38 M = 1.46% Params, 550.43 GMACs = 2.21% MACs, 2.14 ms = 0.87% latency, 513.74 TFLOPS |
|
(gate_proj): Linear(44.79 M = 0.49% Params, 183.48 GMACs = 0.74% MACs, 628.47 us = 0.26% latency, 583.88 TFLOPS, in_features=4096, out_features=10936, bias=False) |
|
(up_proj): Linear(44.79 M = 0.49% Params, 183.48 GMACs = 0.74% MACs, 624.9 us = 0.25% latency, 587.22 TFLOPS, in_features=4096, out_features=10936, bias=False) |
|
(down_proj): Linear(44.79 M = 0.49% Params, 183.48 GMACs = 0.74% MACs, 586.03 us = 0.24% latency, 626.16 TFLOPS, in_features=10936, out_features=4096, bias=False) |
|
(act_fn): PytorchGELUTanh(0 = 0% Params, 0 MACs = 0% MACs, 101.09 us = 0.04% latency, 443.11 GFLOPS) |
|
) |
|
) |
|
(16): DiTLayer( |
|
285.41 M = 3.1% Params, 775.38 GMACs = 3.11% MACs, 7.28 ms = 2.96% latency, 212.96 TFLOPS |
|
(input_layernorm): AdaLayerNormZero( |
|
100.69 M = 1.09% Params, 1.61 GMACs = 0.01% MACs, 967.03 us = 0.39% latency, 3.33 TFLOPS |
|
(silu): SiLU(0 = 0% Params, 0 MACs = 0% MACs, 48.16 us = 0.02% latency, 1.36 GFLOPS) |
|
(linear): Linear(100.69 M = 1.09% Params, 1.61 GMACs = 0.01% MACs, 244.86 us = 0.1% latency, 13.16 TFLOPS, in_features=4096, out_features=24576, bias=True) |
|
(norm): GemmaRMSNorm(4.1 K = 0% Params, 0 MACs = 0% MACs, 463.96 us = 0.19% latency, 0 FLOPS) |
|
) |
|
(self_attn): DiTSelfAttention( |
|
50.33 M = 0.55% Params, 223.34 GMACs = 0.9% MACs, 3.04 ms = 1.23% latency, 147.06 TFLOPS |
|
(q_norm): GemmaRMSNorm(128 = 0% Params, 0 MACs = 0% MACs, 465.39 us = 0.19% latency, 0 FLOPS) |
|
(k_norm): GemmaRMSNorm(128 = 0% Params, 0 MACs = 0% MACs, 236.51 us = 0.1% latency, 0 FLOPS) |
|
(q_proj): Linear(16.78 M = 0.18% Params, 68.72 GMACs = 0.28% MACs, 351.67 us = 0.14% latency, 390.82 TFLOPS, in_features=4096, out_features=4096, bias=False) |
|
(k_proj): Linear(4.19 M = 0.05% Params, 17.18 GMACs = 0.07% MACs, 197.65 us = 0.08% latency, 173.84 TFLOPS, in_features=4096, out_features=1024, bias=False) |
|
(v_proj): Linear(4.19 M = 0.05% Params, 17.18 GMACs = 0.07% MACs, 195.5 us = 0.08% latency, 175.75 TFLOPS, in_features=4096, out_features=1024, bias=False) |
|
(text_k_proj): Linear(4.19 M = 0.05% Params, 17.18 GMACs = 0.07% MACs, 158.55 us = 0.06% latency, 216.71 TFLOPS, in_features=4096, out_features=1024, bias=False) |
|
(text_v_proj): Linear(4.19 M = 0.05% Params, 17.18 GMACs = 0.07% MACs, 154.26 us = 0.06% latency, 222.74 TFLOPS, in_features=4096, out_features=1024, bias=False) |
|
(o_proj): Linear(16.78 M = 0.18% Params, 68.72 GMACs = 0.28% MACs, 344.28 us = 0.14% latency, 399.21 TFLOPS, in_features=4096, out_features=4096, bias=False) |
|
) |
|
(post_attention_layernorm): GemmaRMSNorm(4.1 K = 0% Params, 0 MACs = 0% MACs, 457.53 us = 0.19% latency, 0 FLOPS) |
|
(mlp): GemmaMLP( |
|
134.38 M = 1.46% Params, 550.43 GMACs = 2.21% MACs, 2.16 ms = 0.88% latency, 509.04 TFLOPS |
|
(gate_proj): Linear(44.79 M = 0.49% Params, 183.48 GMACs = 0.74% MACs, 631.81 us = 0.26% latency, 580.79 TFLOPS, in_features=4096, out_features=10936, bias=False) |
|
(up_proj): Linear(44.79 M = 0.49% Params, 183.48 GMACs = 0.74% MACs, 625.85 us = 0.25% latency, 586.33 TFLOPS, in_features=4096, out_features=10936, bias=False) |
|
(down_proj): Linear(44.79 M = 0.49% Params, 183.48 GMACs = 0.74% MACs, 596.76 us = 0.24% latency, 614.9 TFLOPS, in_features=10936, out_features=4096, bias=False) |
|
(act_fn): PytorchGELUTanh(0 = 0% Params, 0 MACs = 0% MACs, 101.33 us = 0.04% latency, 442.07 GFLOPS) |
|
) |
|
) |
|
(17): DiTLayer( |
|
285.41 M = 3.1% Params, 775.38 GMACs = 3.11% MACs, 7.22 ms = 2.94% latency, 214.68 TFLOPS |
|
(input_layernorm): AdaLayerNormZero( |
|
100.69 M = 1.09% Params, 1.61 GMACs = 0.01% MACs, 951.29 us = 0.39% latency, 3.39 TFLOPS |
|
(silu): SiLU(0 = 0% Params, 0 MACs = 0% MACs, 38.62 us = 0.02% latency, 1.7 GFLOPS) |
|
(linear): Linear(100.69 M = 1.09% Params, 1.61 GMACs = 0.01% MACs, 240.8 us = 0.1% latency, 13.38 TFLOPS, in_features=4096, out_features=24576, bias=True) |
|
(norm): GemmaRMSNorm(4.1 K = 0% Params, 0 MACs = 0% MACs, 461.58 us = 0.19% latency, 0 FLOPS) |
|
) |
|
(self_attn): DiTSelfAttention( |
|
50.33 M = 0.55% Params, 223.34 GMACs = 0.9% MACs, 3.01 ms = 1.22% latency, 148.22 TFLOPS |
|
(q_norm): GemmaRMSNorm(128 = 0% Params, 0 MACs = 0% MACs, 465.39 us = 0.19% latency, 0 FLOPS) |
|
(k_norm): GemmaRMSNorm(128 = 0% Params, 0 MACs = 0% MACs, 235.32 us = 0.1% latency, 0 FLOPS) |
|
(q_proj): Linear(16.78 M = 0.18% Params, 68.72 GMACs = 0.28% MACs, 348.09 us = 0.14% latency, 394.84 TFLOPS, in_features=4096, out_features=4096, bias=False) |
|
(k_proj): Linear(4.19 M = 0.05% Params, 17.18 GMACs = 0.07% MACs, 196.93 us = 0.08% latency, 174.47 TFLOPS, in_features=4096, out_features=1024, bias=False) |
|
(v_proj): Linear(4.19 M = 0.05% Params, 17.18 GMACs = 0.07% MACs, 194.55 us = 0.08% latency, 176.61 TFLOPS, in_features=4096, out_features=1024, bias=False) |
|
(text_k_proj): Linear(4.19 M = 0.05% Params, 17.18 GMACs = 0.07% MACs, 156.4 us = 0.06% latency, 219.69 TFLOPS, in_features=4096, out_features=1024, bias=False) |
|
(text_v_proj): Linear(4.19 M = 0.05% Params, 17.18 GMACs = 0.07% MACs, 155.93 us = 0.06% latency, 220.36 TFLOPS, in_features=4096, out_features=1024, bias=False) |
|
(o_proj): Linear(16.78 M = 0.18% Params, 68.72 GMACs = 0.28% MACs, 342.61 us = 0.14% latency, 401.16 TFLOPS, in_features=4096, out_features=4096, bias=False) |
|
) |
|
(post_attention_layernorm): GemmaRMSNorm(4.1 K = 0% Params, 0 MACs = 0% MACs, 458.24 us = 0.19% latency, 0 FLOPS) |
|
(mlp): GemmaMLP( |
|
134.38 M = 1.46% Params, 550.43 GMACs = 2.21% MACs, 2.15 ms = 0.87% latency, 513 TFLOPS |
|
(gate_proj): Linear(44.79 M = 0.49% Params, 183.48 GMACs = 0.74% MACs, 629.43 us = 0.26% latency, 582.99 TFLOPS, in_features=4096, out_features=10936, bias=False) |
|
(up_proj): Linear(44.79 M = 0.49% Params, 183.48 GMACs = 0.74% MACs, 625.37 us = 0.25% latency, 586.77 TFLOPS, in_features=4096, out_features=10936, bias=False) |
|
(down_proj): Linear(44.79 M = 0.49% Params, 183.48 GMACs = 0.74% MACs, 585.08 us = 0.24% latency, 627.18 TFLOPS, in_features=10936, out_features=4096, bias=False) |
|
(act_fn): PytorchGELUTanh(0 = 0% Params, 0 MACs = 0% MACs, 101.09 us = 0.04% latency, 443.11 GFLOPS) |
|
) |
|
) |
|
(18): DiTLayer( |
|
285.41 M = 3.1% Params, 775.38 GMACs = 3.11% MACs, 7.2 ms = 2.93% latency, 215.25 TFLOPS |
|
(input_layernorm): AdaLayerNormZero( |
|
100.69 M = 1.09% Params, 1.61 GMACs = 0.01% MACs, 938.89 us = 0.38% latency, 3.43 TFLOPS |
|
(silu): SiLU(0 = 0% Params, 0 MACs = 0% MACs, 37.19 us = 0.02% latency, 1.76 GFLOPS) |
|
(linear): Linear(100.69 M = 1.09% Params, 1.61 GMACs = 0.01% MACs, 234.6 us = 0.1% latency, 13.73 TFLOPS, in_features=4096, out_features=24576, bias=True) |
|
(norm): GemmaRMSNorm(4.1 K = 0% Params, 0 MACs = 0% MACs, 461.1 us = 0.19% latency, 0 FLOPS) |
|
) |
|
(self_attn): DiTSelfAttention( |
|
50.33 M = 0.55% Params, 223.34 GMACs = 0.9% MACs, 3.01 ms = 1.23% latency, 148.16 TFLOPS |
|
(q_norm): GemmaRMSNorm(128 = 0% Params, 0 MACs = 0% MACs, 465.15 us = 0.19% latency, 0 FLOPS) |
|
(k_norm): GemmaRMSNorm(128 = 0% Params, 0 MACs = 0% MACs, 236.27 us = 0.1% latency, 0 FLOPS) |
|
(q_proj): Linear(16.78 M = 0.18% Params, 68.72 GMACs = 0.28% MACs, 350.71 us = 0.14% latency, 391.88 TFLOPS, in_features=4096, out_features=4096, bias=False) |
|
(k_proj): Linear(4.19 M = 0.05% Params, 17.18 GMACs = 0.07% MACs, 197.17 us = 0.08% latency, 174.26 TFLOPS, in_features=4096, out_features=1024, bias=False) |
|
(v_proj): Linear(4.19 M = 0.05% Params, 17.18 GMACs = 0.07% MACs, 195.03 us = 0.08% latency, 176.18 TFLOPS, in_features=4096, out_features=1024, bias=False) |
|
(text_k_proj): Linear(4.19 M = 0.05% Params, 17.18 GMACs = 0.07% MACs, 155.93 us = 0.06% latency, 220.36 TFLOPS, in_features=4096, out_features=1024, bias=False) |
|
(text_v_proj): Linear(4.19 M = 0.05% Params, 17.18 GMACs = 0.07% MACs, 154.26 us = 0.06% latency, 222.74 TFLOPS, in_features=4096, out_features=1024, bias=False) |
|
(o_proj): Linear(16.78 M = 0.18% Params, 68.72 GMACs = 0.28% MACs, 334.02 us = 0.14% latency, 411.46 TFLOPS, in_features=4096, out_features=4096, bias=False) |
|
) |
|
(post_attention_layernorm): GemmaRMSNorm(4.1 K = 0% Params, 0 MACs = 0% MACs, 460.15 us = 0.19% latency, 0 FLOPS) |
|
(mlp): GemmaMLP( |
|
134.38 M = 1.46% Params, 550.43 GMACs = 2.21% MACs, 2.14 ms = 0.87% latency, 514.71 TFLOPS |
|
(gate_proj): Linear(44.79 M = 0.49% Params, 183.48 GMACs = 0.74% MACs, 627.28 us = 0.25% latency, 584.99 TFLOPS, in_features=4096, out_features=10936, bias=False) |
|
(up_proj): Linear(44.79 M = 0.49% Params, 183.48 GMACs = 0.74% MACs, 623.94 us = 0.25% latency, 588.12 TFLOPS, in_features=4096, out_features=10936, bias=False) |
|
(down_proj): Linear(44.79 M = 0.49% Params, 183.48 GMACs = 0.74% MACs, 584.6 us = 0.24% latency, 627.69 TFLOPS, in_features=10936, out_features=4096, bias=False) |
|
(act_fn): PytorchGELUTanh(0 = 0% Params, 0 MACs = 0% MACs, 101.09 us = 0.04% latency, 443.11 GFLOPS) |
|
) |
|
) |
|
(19): DiTLayer( |
|
285.41 M = 3.1% Params, 775.38 GMACs = 3.11% MACs, 7.41 ms = 3.01% latency, 209.34 TFLOPS |
|
(input_layernorm): AdaLayerNormZero( |
|
100.69 M = 1.09% Params, 1.61 GMACs = 0.01% MACs, 945.33 us = 0.38% latency, 3.41 TFLOPS |
|
(silu): SiLU(0 = 0% Params, 0 MACs = 0% MACs, 37.67 us = 0.02% latency, 1.74 GFLOPS) |
|
(linear): Linear(100.69 M = 1.09% Params, 1.61 GMACs = 0.01% MACs, 237.7 us = 0.1% latency, 13.55 TFLOPS, in_features=4096, out_features=24576, bias=True) |
|
(norm): GemmaRMSNorm(4.1 K = 0% Params, 0 MACs = 0% MACs, 462.53 us = 0.19% latency, 0 FLOPS) |
|
) |
|
(self_attn): DiTSelfAttention( |
|
50.33 M = 0.55% Params, 223.34 GMACs = 0.9% MACs, 3.01 ms = 1.23% latency, 148.17 TFLOPS |
|
(q_norm): GemmaRMSNorm(128 = 0% Params, 0 MACs = 0% MACs, 464.44 us = 0.19% latency, 0 FLOPS) |
|
(k_norm): GemmaRMSNorm(128 = 0% Params, 0 MACs = 0% MACs, 233.89 us = 0.1% latency, 0 FLOPS) |
|
(q_proj): Linear(16.78 M = 0.18% Params, 68.72 GMACs = 0.28% MACs, 350 us = 0.14% latency, 392.68 TFLOPS, in_features=4096, out_features=4096, bias=False) |
|
(k_proj): Linear(4.19 M = 0.05% Params, 17.18 GMACs = 0.07% MACs, 198.6 us = 0.08% latency, 173.01 TFLOPS, in_features=4096, out_features=1024, bias=False) |
|
(v_proj): Linear(4.19 M = 0.05% Params, 17.18 GMACs = 0.07% MACs, 196.22 us = 0.08% latency, 175.11 TFLOPS, in_features=4096, out_features=1024, bias=False) |
|
(text_k_proj): Linear(4.19 M = 0.05% Params, 17.18 GMACs = 0.07% MACs, 158.07 us = 0.06% latency, 217.37 TFLOPS, in_features=4096, out_features=1024, bias=False) |
|
(text_v_proj): Linear(4.19 M = 0.05% Params, 17.18 GMACs = 0.07% MACs, 154.97 us = 0.06% latency, 221.72 TFLOPS, in_features=4096, out_features=1024, bias=False) |
|
(o_proj): Linear(16.78 M = 0.18% Params, 68.72 GMACs = 0.28% MACs, 333.07 us = 0.14% latency, 412.64 TFLOPS, in_features=4096, out_features=4096, bias=False) |
|
) |
|
(post_attention_layernorm): GemmaRMSNorm(4.1 K = 0% Params, 0 MACs = 0% MACs, 458.24 us = 0.19% latency, 0 FLOPS) |
|
(mlp): GemmaMLP( |
|
134.38 M = 1.46% Params, 550.43 GMACs = 2.21% MACs, 2.33 ms = 0.95% latency, 471.85 TFLOPS |
|
(gate_proj): Linear(44.79 M = 0.49% Params, 183.48 GMACs = 0.74% MACs, 627.76 us = 0.26% latency, 584.54 TFLOPS, in_features=4096, out_features=10936, bias=False) |
|
(up_proj): Linear(44.79 M = 0.49% Params, 183.48 GMACs = 0.74% MACs, 632.76 us = 0.26% latency, 579.92 TFLOPS, in_features=4096, out_features=10936, bias=False) |
|
(down_proj): Linear(44.79 M = 0.49% Params, 183.48 GMACs = 0.74% MACs, 596.28 us = 0.24% latency, 615.4 TFLOPS, in_features=10936, out_features=4096, bias=False) |
|
(act_fn): PytorchGELUTanh(0 = 0% Params, 0 MACs = 0% MACs, 235.08 us = 0.1% latency, 190.55 GFLOPS) |
|
) |
|
) |
|
(20): DiTLayer( |
|
285.41 M = 3.1% Params, 775.38 GMACs = 3.11% MACs, 7.26 ms = 2.95% latency, 213.63 TFLOPS |
|
(input_layernorm): AdaLayerNormZero( |
|
100.69 M = 1.09% Params, 1.61 GMACs = 0.01% MACs, 948.19 us = 0.39% latency, 3.4 TFLOPS |
|
(silu): SiLU(0 = 0% Params, 0 MACs = 0% MACs, 37.19 us = 0.02% latency, 1.76 GFLOPS) |
|
(linear): Linear(100.69 M = 1.09% Params, 1.61 GMACs = 0.01% MACs, 234.84 us = 0.1% latency, 13.72 TFLOPS, in_features=4096, out_features=24576, bias=True) |
|
(norm): GemmaRMSNorm(4.1 K = 0% Params, 0 MACs = 0% MACs, 463.96 us = 0.19% latency, 0 FLOPS) |
|
) |
|
(self_attn): DiTSelfAttention( |
|
50.33 M = 0.55% Params, 223.34 GMACs = 0.9% MACs, 3.05 ms = 1.24% latency, 146.58 TFLOPS |
|
(q_norm): GemmaRMSNorm(128 = 0% Params, 0 MACs = 0% MACs, 468.02 us = 0.19% latency, 0 FLOPS) |
|
(k_norm): GemmaRMSNorm(128 = 0% Params, 0 MACs = 0% MACs, 240.33 us = 0.1% latency, 0 FLOPS) |
|
(q_proj): Linear(16.78 M = 0.18% Params, 68.72 GMACs = 0.28% MACs, 349.76 us = 0.14% latency, 392.95 TFLOPS, in_features=4096, out_features=4096, bias=False) |
|
(k_proj): Linear(4.19 M = 0.05% Params, 17.18 GMACs = 0.07% MACs, 198.36 us = 0.08% latency, 173.22 TFLOPS, in_features=4096, out_features=1024, bias=False) |
|
(v_proj): Linear(4.19 M = 0.05% Params, 17.18 GMACs = 0.07% MACs, 196.93 us = 0.08% latency, 174.47 TFLOPS, in_features=4096, out_features=1024, bias=False) |
|
(text_k_proj): Linear(4.19 M = 0.05% Params, 17.18 GMACs = 0.07% MACs, 158.31 us = 0.06% latency, 217.04 TFLOPS, in_features=4096, out_features=1024, bias=False) |
|
(text_v_proj): Linear(4.19 M = 0.05% Params, 17.18 GMACs = 0.07% MACs, 154.97 us = 0.06% latency, 221.72 TFLOPS, in_features=4096, out_features=1024, bias=False) |
|
(o_proj): Linear(16.78 M = 0.18% Params, 68.72 GMACs = 0.28% MACs, 344.04 us = 0.14% latency, 399.49 TFLOPS, in_features=4096, out_features=4096, bias=False) |
|
) |
|
(post_attention_layernorm): GemmaRMSNorm(4.1 K = 0% Params, 0 MACs = 0% MACs, 458.48 us = 0.19% latency, 0 FLOPS) |
|
(mlp): GemmaMLP( |
|
134.38 M = 1.46% Params, 550.43 GMACs = 2.21% MACs, 2.15 ms = 0.87% latency, 512.71 TFLOPS |
|
(gate_proj): Linear(44.79 M = 0.49% Params, 183.48 GMACs = 0.74% MACs, 628.47 us = 0.26% latency, 583.88 TFLOPS, in_features=4096, out_features=10936, bias=False) |
|
(up_proj): Linear(44.79 M = 0.49% Params, 183.48 GMACs = 0.74% MACs, 624.42 us = 0.25% latency, 587.67 TFLOPS, in_features=4096, out_features=10936, bias=False) |
|
(down_proj): Linear(44.79 M = 0.49% Params, 183.48 GMACs = 0.74% MACs, 589.61 us = 0.24% latency, 622.36 TFLOPS, in_features=10936, out_features=4096, bias=False) |
|
(act_fn): PytorchGELUTanh(0 = 0% Params, 0 MACs = 0% MACs, 101.57 us = 0.04% latency, 441.03 GFLOPS) |
|
) |
|
) |
|
(21): DiTLayer( |
|
285.41 M = 3.1% Params, 775.38 GMACs = 3.11% MACs, 7.23 ms = 2.94% latency, 214.63 TFLOPS |
|
(input_layernorm): AdaLayerNormZero( |
|
100.69 M = 1.09% Params, 1.61 GMACs = 0.01% MACs, 953.91 us = 0.39% latency, 3.38 TFLOPS |
|
(silu): SiLU(0 = 0% Params, 0 MACs = 0% MACs, 41.48 us = 0.02% latency, 1.58 GFLOPS) |
|
(linear): Linear(100.69 M = 1.09% Params, 1.61 GMACs = 0.01% MACs, 238.42 us = 0.1% latency, 13.51 TFLOPS, in_features=4096, out_features=24576, bias=True) |
|
(norm): GemmaRMSNorm(4.1 K = 0% Params, 0 MACs = 0% MACs, 465.87 us = 0.19% latency, 0 FLOPS) |
|
) |
|
(self_attn): DiTSelfAttention( |
|
50.33 M = 0.55% Params, 223.34 GMACs = 0.9% MACs, 3.02 ms = 1.23% latency, 147.75 TFLOPS |
|
(q_norm): GemmaRMSNorm(128 = 0% Params, 0 MACs = 0% MACs, 465.87 us = 0.19% latency, 0 FLOPS) |
|
(k_norm): GemmaRMSNorm(128 = 0% Params, 0 MACs = 0% MACs, 236.51 us = 0.1% latency, 0 FLOPS) |
|
(q_proj): Linear(16.78 M = 0.18% Params, 68.72 GMACs = 0.28% MACs, 349.52 us = 0.14% latency, 393.22 TFLOPS, in_features=4096, out_features=4096, bias=False) |
|
(k_proj): Linear(4.19 M = 0.05% Params, 17.18 GMACs = 0.07% MACs, 197.65 us = 0.08% latency, 173.84 TFLOPS, in_features=4096, out_features=1024, bias=False) |
|
(v_proj): Linear(4.19 M = 0.05% Params, 17.18 GMACs = 0.07% MACs, 194.55 us = 0.08% latency, 176.61 TFLOPS, in_features=4096, out_features=1024, bias=False) |
|
(text_k_proj): Linear(4.19 M = 0.05% Params, 17.18 GMACs = 0.07% MACs, 158.07 us = 0.06% latency, 217.37 TFLOPS, in_features=4096, out_features=1024, bias=False) |
|
(text_v_proj): Linear(4.19 M = 0.05% Params, 17.18 GMACs = 0.07% MACs, 155.21 us = 0.06% latency, 221.38 TFLOPS, in_features=4096, out_features=1024, bias=False) |
|
(o_proj): Linear(16.78 M = 0.18% Params, 68.72 GMACs = 0.28% MACs, 338.55 us = 0.14% latency, 405.96 TFLOPS, in_features=4096, out_features=4096, bias=False) |
|
) |
|
(post_attention_layernorm): GemmaRMSNorm(4.1 K = 0% Params, 0 MACs = 0% MACs, 457.53 us = 0.19% latency, 0 FLOPS) |
|
(mlp): GemmaMLP( |
|
134.38 M = 1.46% Params, 550.43 GMACs = 2.21% MACs, 2.14 ms = 0.87% latency, 514.54 TFLOPS |
|
(gate_proj): Linear(44.79 M = 0.49% Params, 183.48 GMACs = 0.74% MACs, 626.8 us = 0.25% latency, 585.43 TFLOPS, in_features=4096, out_features=10936, bias=False) |
|
(up_proj): Linear(44.79 M = 0.49% Params, 183.48 GMACs = 0.74% MACs, 623.46 us = 0.25% latency, 588.57 TFLOPS, in_features=4096, out_features=10936, bias=False) |
|
(down_proj): Linear(44.79 M = 0.49% Params, 183.48 GMACs = 0.74% MACs, 585.56 us = 0.24% latency, 626.67 TFLOPS, in_features=10936, out_features=4096, bias=False) |
|
(act_fn): PytorchGELUTanh(0 = 0% Params, 0 MACs = 0% MACs, 100.85 us = 0.04% latency, 444.16 GFLOPS) |
|
) |
|
) |
|
(22): DiTLayer( |
|
285.41 M = 3.1% Params, 775.38 GMACs = 3.11% MACs, 7.2 ms = 2.93% latency, 215.32 TFLOPS |
|
(input_layernorm): AdaLayerNormZero( |
|
100.69 M = 1.09% Params, 1.61 GMACs = 0.01% MACs, 940.08 us = 0.38% latency, 3.43 TFLOPS |
|
(silu): SiLU(0 = 0% Params, 0 MACs = 0% MACs, 36.95 us = 0.02% latency, 1.77 GFLOPS) |
|
(linear): Linear(100.69 M = 1.09% Params, 1.61 GMACs = 0.01% MACs, 237.46 us = 0.1% latency, 13.57 TFLOPS, in_features=4096, out_features=24576, bias=True) |
|
(norm): GemmaRMSNorm(4.1 K = 0% Params, 0 MACs = 0% MACs, 459.19 us = 0.19% latency, 0 FLOPS) |
|
) |
|
(self_attn): DiTSelfAttention( |
|
50.33 M = 0.55% Params, 223.34 GMACs = 0.9% MACs, 3.02 ms = 1.23% latency, 148.01 TFLOPS |
|
(q_norm): GemmaRMSNorm(128 = 0% Params, 0 MACs = 0% MACs, 463.01 us = 0.19% latency, 0 FLOPS) |
|
(k_norm): GemmaRMSNorm(128 = 0% Params, 0 MACs = 0% MACs, 236.03 us = 0.1% latency, 0 FLOPS) |
|
(q_proj): Linear(16.78 M = 0.18% Params, 68.72 GMACs = 0.28% MACs, 348.57 us = 0.14% latency, 394.3 TFLOPS, in_features=4096, out_features=4096, bias=False) |
|
(k_proj): Linear(4.19 M = 0.05% Params, 17.18 GMACs = 0.07% MACs, 198.13 us = 0.08% latency, 173.42 TFLOPS, in_features=4096, out_features=1024, bias=False) |
|
(v_proj): Linear(4.19 M = 0.05% Params, 17.18 GMACs = 0.07% MACs, 194.55 us = 0.08% latency, 176.61 TFLOPS, in_features=4096, out_features=1024, bias=False) |
|
(text_k_proj): Linear(4.19 M = 0.05% Params, 17.18 GMACs = 0.07% MACs, 158.31 us = 0.06% latency, 217.04 TFLOPS, in_features=4096, out_features=1024, bias=False) |
|
(text_v_proj): Linear(4.19 M = 0.05% Params, 17.18 GMACs = 0.07% MACs, 152.35 us = 0.06% latency, 225.53 TFLOPS, in_features=4096, out_features=1024, bias=False) |
|
(o_proj): Linear(16.78 M = 0.18% Params, 68.72 GMACs = 0.28% MACs, 336.89 us = 0.14% latency, 407.97 TFLOPS, in_features=4096, out_features=4096, bias=False) |
|
) |
|
(post_attention_layernorm): GemmaRMSNorm(4.1 K = 0% Params, 0 MACs = 0% MACs, 457.05 us = 0.19% latency, 0 FLOPS) |
|
(mlp): GemmaMLP( |
|
134.38 M = 1.46% Params, 550.43 GMACs = 2.21% MACs, 2.14 ms = 0.87% latency, 515 TFLOPS |
|
(gate_proj): Linear(44.79 M = 0.49% Params, 183.48 GMACs = 0.74% MACs, 627.28 us = 0.25% latency, 584.99 TFLOPS, in_features=4096, out_features=10936, bias=False) |
|
(up_proj): Linear(44.79 M = 0.49% Params, 183.48 GMACs = 0.74% MACs, 624.66 us = 0.25% latency, 587.44 TFLOPS, in_features=4096, out_features=10936, bias=False) |
|
(down_proj): Linear(44.79 M = 0.49% Params, 183.48 GMACs = 0.74% MACs, 584.6 us = 0.24% latency, 627.69 TFLOPS, in_features=10936, out_features=4096, bias=False) |
|
(act_fn): PytorchGELUTanh(0 = 0% Params, 0 MACs = 0% MACs, 99.9 us = 0.04% latency, 448.4 GFLOPS) |
|
) |
|
) |
|
(23): DiTLayer( |
|
285.41 M = 3.1% Params, 775.38 GMACs = 3.11% MACs, 7.2 ms = 2.93% latency, 215.41 TFLOPS |
|
(input_layernorm): AdaLayerNormZero( |
|
100.69 M = 1.09% Params, 1.61 GMACs = 0.01% MACs, 944.85 us = 0.38% latency, 3.41 TFLOPS |
|
(silu): SiLU(0 = 0% Params, 0 MACs = 0% MACs, 36.95 us = 0.02% latency, 1.77 GFLOPS) |
|
(linear): Linear(100.69 M = 1.09% Params, 1.61 GMACs = 0.01% MACs, 237.94 us = 0.1% latency, 13.54 TFLOPS, in_features=4096, out_features=24576, bias=True) |
|
(norm): GemmaRMSNorm(4.1 K = 0% Params, 0 MACs = 0% MACs, 462.29 us = 0.19% latency, 0 FLOPS) |
|
) |
|
(self_attn): DiTSelfAttention( |
|
50.33 M = 0.55% Params, 223.34 GMACs = 0.9% MACs, 3.02 ms = 1.23% latency, 148.04 TFLOPS |
|
(q_norm): GemmaRMSNorm(128 = 0% Params, 0 MACs = 0% MACs, 465.39 us = 0.19% latency, 0 FLOPS) |
|
(k_norm): GemmaRMSNorm(128 = 0% Params, 0 MACs = 0% MACs, 237.7 us = 0.1% latency, 0 FLOPS) |
|
(q_proj): Linear(16.78 M = 0.18% Params, 68.72 GMACs = 0.28% MACs, 349.04 us = 0.14% latency, 393.76 TFLOPS, in_features=4096, out_features=4096, bias=False) |
|
(k_proj): Linear(4.19 M = 0.05% Params, 17.18 GMACs = 0.07% MACs, 199.08 us = 0.08% latency, 172.59 TFLOPS, in_features=4096, out_features=1024, bias=False) |
|
(v_proj): Linear(4.19 M = 0.05% Params, 17.18 GMACs = 0.07% MACs, 195.03 us = 0.08% latency, 176.18 TFLOPS, in_features=4096, out_features=1024, bias=False) |
|
(text_k_proj): Linear(4.19 M = 0.05% Params, 17.18 GMACs = 0.07% MACs, 158.79 us = 0.06% latency, 216.39 TFLOPS, in_features=4096, out_features=1024, bias=False) |
|
(text_v_proj): Linear(4.19 M = 0.05% Params, 17.18 GMACs = 0.07% MACs, 156.4 us = 0.06% latency, 219.69 TFLOPS, in_features=4096, out_features=1024, bias=False) |
|
(o_proj): Linear(16.78 M = 0.18% Params, 68.72 GMACs = 0.28% MACs, 334.02 us = 0.14% latency, 411.46 TFLOPS, in_features=4096, out_features=4096, bias=False) |
|
) |
|
(post_attention_layernorm): GemmaRMSNorm(4.1 K = 0% Params, 0 MACs = 0% MACs, 456.81 us = 0.19% latency, 0 FLOPS) |
|
(mlp): GemmaMLP( |
|
134.38 M = 1.46% Params, 550.43 GMACs = 2.21% MACs, 2.13 ms = 0.87% latency, 516.21 TFLOPS |
|
(gate_proj): Linear(44.79 M = 0.49% Params, 183.48 GMACs = 0.74% MACs, 627.28 us = 0.25% latency, 584.99 TFLOPS, in_features=4096, out_features=10936, bias=False) |
|
(up_proj): Linear(44.79 M = 0.49% Params, 183.48 GMACs = 0.74% MACs, 623.94 us = 0.25% latency, 588.12 TFLOPS, in_features=4096, out_features=10936, bias=False) |
|
(down_proj): Linear(44.79 M = 0.49% Params, 183.48 GMACs = 0.74% MACs, 581.98 us = 0.24% latency, 630.52 TFLOPS, in_features=10936, out_features=4096, bias=False) |
|
(act_fn): PytorchGELUTanh(0 = 0% Params, 0 MACs = 0% MACs, 100.61 us = 0.04% latency, 445.21 GFLOPS) |
|
) |
|
) |
|
(24): DiTLayer( |
|
285.41 M = 3.1% Params, 775.38 GMACs = 3.11% MACs, 7.23 ms = 2.94% latency, 214.37 TFLOPS |
|
(input_layernorm): AdaLayerNormZero( |
|
100.69 M = 1.09% Params, 1.61 GMACs = 0.01% MACs, 962.5 us = 0.39% latency, 3.35 TFLOPS |
|
(silu): SiLU(0 = 0% Params, 0 MACs = 0% MACs, 39.34 us = 0.02% latency, 1.67 GFLOPS) |
|
(linear): Linear(100.69 M = 1.09% Params, 1.61 GMACs = 0.01% MACs, 241.99 us = 0.1% latency, 13.31 TFLOPS, in_features=4096, out_features=24576, bias=True) |
|
(norm): GemmaRMSNorm(4.1 K = 0% Params, 0 MACs = 0% MACs, 462.53 us = 0.19% latency, 0 FLOPS) |
|
) |
|
(self_attn): DiTSelfAttention( |
|
50.33 M = 0.55% Params, 223.34 GMACs = 0.9% MACs, 3.02 ms = 1.23% latency, 147.81 TFLOPS |
|
(q_norm): GemmaRMSNorm(128 = 0% Params, 0 MACs = 0% MACs, 466.35 us = 0.19% latency, 0 FLOPS) |
|
(k_norm): GemmaRMSNorm(128 = 0% Params, 0 MACs = 0% MACs, 239.13 us = 0.1% latency, 0 FLOPS) |
|
(q_proj): Linear(16.78 M = 0.18% Params, 68.72 GMACs = 0.28% MACs, 350 us = 0.14% latency, 392.68 TFLOPS, in_features=4096, out_features=4096, bias=False) |
|
(k_proj): Linear(4.19 M = 0.05% Params, 17.18 GMACs = 0.07% MACs, 199.08 us = 0.08% latency, 172.59 TFLOPS, in_features=4096, out_features=1024, bias=False) |
|
(v_proj): Linear(4.19 M = 0.05% Params, 17.18 GMACs = 0.07% MACs, 195.98 us = 0.08% latency, 175.32 TFLOPS, in_features=4096, out_features=1024, bias=False) |
|
(text_k_proj): Linear(4.19 M = 0.05% Params, 17.18 GMACs = 0.07% MACs, 157.12 us = 0.06% latency, 218.69 TFLOPS, in_features=4096, out_features=1024, bias=False) |
|
(text_v_proj): Linear(4.19 M = 0.05% Params, 17.18 GMACs = 0.07% MACs, 154.5 us = 0.06% latency, 222.4 TFLOPS, in_features=4096, out_features=1024, bias=False) |
|
(o_proj): Linear(16.78 M = 0.18% Params, 68.72 GMACs = 0.28% MACs, 334.02 us = 0.14% latency, 411.46 TFLOPS, in_features=4096, out_features=4096, bias=False) |
|
) |
|
(post_attention_layernorm): GemmaRMSNorm(4.1 K = 0% Params, 0 MACs = 0% MACs, 457.76 us = 0.19% latency, 0 FLOPS) |
|
(mlp): GemmaMLP( |
|
134.38 M = 1.46% Params, 550.43 GMACs = 2.21% MACs, 2.14 ms = 0.87% latency, 514.43 TFLOPS |
|
(gate_proj): Linear(44.79 M = 0.49% Params, 183.48 GMACs = 0.74% MACs, 627.52 us = 0.26% latency, 584.77 TFLOPS, in_features=4096, out_features=10936, bias=False) |
|
(up_proj): Linear(44.79 M = 0.49% Params, 183.48 GMACs = 0.74% MACs, 623.94 us = 0.25% latency, 588.12 TFLOPS, in_features=4096, out_features=10936, bias=False) |
|
(down_proj): Linear(44.79 M = 0.49% Params, 183.48 GMACs = 0.74% MACs, 584.6 us = 0.24% latency, 627.69 TFLOPS, in_features=10936, out_features=4096, bias=False) |
|
(act_fn): PytorchGELUTanh(0 = 0% Params, 0 MACs = 0% MACs, 101.09 us = 0.04% latency, 443.11 GFLOPS) |
|
) |
|
) |
|
(25): DiTLayer( |
|
285.41 M = 3.1% Params, 775.38 GMACs = 3.11% MACs, 7.22 ms = 2.93% latency, 214.88 TFLOPS |
|
(input_layernorm): AdaLayerNormZero( |
|
100.69 M = 1.09% Params, 1.61 GMACs = 0.01% MACs, 940.32 us = 0.38% latency, 3.43 TFLOPS |
|
(silu): SiLU(0 = 0% Params, 0 MACs = 0% MACs, 36.48 us = 0.01% latency, 1.8 GFLOPS) |
|
(linear): Linear(100.69 M = 1.09% Params, 1.61 GMACs = 0.01% MACs, 233.17 us = 0.09% latency, 13.81 TFLOPS, in_features=4096, out_features=24576, bias=True) |
|
(norm): GemmaRMSNorm(4.1 K = 0% Params, 0 MACs = 0% MACs, 462.06 us = 0.19% latency, 0 FLOPS) |
|
) |
|
(self_attn): DiTSelfAttention( |
|
50.33 M = 0.55% Params, 223.34 GMACs = 0.9% MACs, 3.02 ms = 1.23% latency, 147.96 TFLOPS |
|
(q_norm): GemmaRMSNorm(128 = 0% Params, 0 MACs = 0% MACs, 464.44 us = 0.19% latency, 0 FLOPS) |
|
(k_norm): GemmaRMSNorm(128 = 0% Params, 0 MACs = 0% MACs, 242.95 us = 0.1% latency, 0 FLOPS) |
|
(q_proj): Linear(16.78 M = 0.18% Params, 68.72 GMACs = 0.28% MACs, 349.04 us = 0.14% latency, 393.76 TFLOPS, in_features=4096, out_features=4096, bias=False) |
|
(k_proj): Linear(4.19 M = 0.05% Params, 17.18 GMACs = 0.07% MACs, 199.08 us = 0.08% latency, 172.59 TFLOPS, in_features=4096, out_features=1024, bias=False) |
|
(v_proj): Linear(4.19 M = 0.05% Params, 17.18 GMACs = 0.07% MACs, 195.98 us = 0.08% latency, 175.32 TFLOPS, in_features=4096, out_features=1024, bias=False) |
|
(text_k_proj): Linear(4.19 M = 0.05% Params, 17.18 GMACs = 0.07% MACs, 157.12 us = 0.06% latency, 218.69 TFLOPS, in_features=4096, out_features=1024, bias=False) |
|
(text_v_proj): Linear(4.19 M = 0.05% Params, 17.18 GMACs = 0.07% MACs, 153.3 us = 0.06% latency, 224.13 TFLOPS, in_features=4096, out_features=1024, bias=False) |
|
(o_proj): Linear(16.78 M = 0.18% Params, 68.72 GMACs = 0.28% MACs, 335.93 us = 0.14% latency, 409.13 TFLOPS, in_features=4096, out_features=4096, bias=False) |
|
) |
|
(post_attention_layernorm): GemmaRMSNorm(4.1 K = 0% Params, 0 MACs = 0% MACs, 457.53 us = 0.19% latency, 0 FLOPS) |
|
(mlp): GemmaMLP( |
|
134.38 M = 1.46% Params, 550.43 GMACs = 2.21% MACs, 2.15 ms = 0.87% latency, 512.03 TFLOPS |
|
(gate_proj): Linear(44.79 M = 0.49% Params, 183.48 GMACs = 0.74% MACs, 633.48 us = 0.26% latency, 579.26 TFLOPS, in_features=4096, out_features=10936, bias=False) |
|
(up_proj): Linear(44.79 M = 0.49% Params, 183.48 GMACs = 0.74% MACs, 626.09 us = 0.25% latency, 586.1 TFLOPS, in_features=4096, out_features=10936, bias=False) |
|
(down_proj): Linear(44.79 M = 0.49% Params, 183.48 GMACs = 0.74% MACs, 588.42 us = 0.24% latency, 623.62 TFLOPS, in_features=10936, out_features=4096, bias=False) |
|
(act_fn): PytorchGELUTanh(0 = 0% Params, 0 MACs = 0% MACs, 100.37 us = 0.04% latency, 446.27 GFLOPS) |
|
) |
|
) |
|
(26): DiTLayer( |
|
285.41 M = 3.1% Params, 775.38 GMACs = 3.11% MACs, 7.31 ms = 2.97% latency, 212.01 TFLOPS |
|
(input_layernorm): AdaLayerNormZero( |
|
100.69 M = 1.09% Params, 1.61 GMACs = 0.01% MACs, 1.03 ms = 0.42% latency, 3.13 TFLOPS |
|
(silu): SiLU(0 = 0% Params, 0 MACs = 0% MACs, 36 us = 0.01% latency, 1.82 GFLOPS) |
|
(linear): Linear(100.69 M = 1.09% Params, 1.61 GMACs = 0.01% MACs, 235.8 us = 0.1% latency, 13.66 TFLOPS, in_features=4096, out_features=24576, bias=True) |
|
(norm): GemmaRMSNorm(4.1 K = 0% Params, 0 MACs = 0% MACs, 464.68 us = 0.19% latency, 0 FLOPS) |
|
) |
|
(self_attn): DiTSelfAttention( |
|
50.33 M = 0.55% Params, 223.34 GMACs = 0.9% MACs, 3.03 ms = 1.23% latency, 147.59 TFLOPS |
|
(q_norm): GemmaRMSNorm(128 = 0% Params, 0 MACs = 0% MACs, 468.02 us = 0.19% latency, 0 FLOPS) |
|
(k_norm): GemmaRMSNorm(128 = 0% Params, 0 MACs = 0% MACs, 237.23 us = 0.1% latency, 0 FLOPS) |
|
(q_proj): Linear(16.78 M = 0.18% Params, 68.72 GMACs = 0.28% MACs, 360.01 us = 0.15% latency, 381.76 TFLOPS, in_features=4096, out_features=4096, bias=False) |
|
(k_proj): Linear(4.19 M = 0.05% Params, 17.18 GMACs = 0.07% MACs, 197.89 us = 0.08% latency, 173.63 TFLOPS, in_features=4096, out_features=1024, bias=False) |
|
(v_proj): Linear(4.19 M = 0.05% Params, 17.18 GMACs = 0.07% MACs, 194.55 us = 0.08% latency, 176.61 TFLOPS, in_features=4096, out_features=1024, bias=False) |
|
(text_k_proj): Linear(4.19 M = 0.05% Params, 17.18 GMACs = 0.07% MACs, 156.64 us = 0.06% latency, 219.35 TFLOPS, in_features=4096, out_features=1024, bias=False) |
|
(text_v_proj): Linear(4.19 M = 0.05% Params, 17.18 GMACs = 0.07% MACs, 153.54 us = 0.06% latency, 223.78 TFLOPS, in_features=4096, out_features=1024, bias=False) |
|
(o_proj): Linear(16.78 M = 0.18% Params, 68.72 GMACs = 0.28% MACs, 335.93 us = 0.14% latency, 409.13 TFLOPS, in_features=4096, out_features=4096, bias=False) |
|
) |
|
(post_attention_layernorm): GemmaRMSNorm(4.1 K = 0% Params, 0 MACs = 0% MACs, 460.15 us = 0.19% latency, 0 FLOPS) |
|
(mlp): GemmaMLP( |
|
134.38 M = 1.46% Params, 550.43 GMACs = 2.21% MACs, 2.15 ms = 0.87% latency, 512.43 TFLOPS |
|
(gate_proj): Linear(44.79 M = 0.49% Params, 183.48 GMACs = 0.74% MACs, 629.43 us = 0.26% latency, 582.99 TFLOPS, in_features=4096, out_features=10936, bias=False) |
|
(up_proj): Linear(44.79 M = 0.49% Params, 183.48 GMACs = 0.74% MACs, 626.33 us = 0.25% latency, 585.88 TFLOPS, in_features=4096, out_features=10936, bias=False) |
|
(down_proj): Linear(44.79 M = 0.49% Params, 183.48 GMACs = 0.74% MACs, 588.89 us = 0.24% latency, 623.12 TFLOPS, in_features=10936, out_features=4096, bias=False) |
|
(act_fn): PytorchGELUTanh(0 = 0% Params, 0 MACs = 0% MACs, 100.85 us = 0.04% latency, 444.16 GFLOPS) |
|
) |
|
) |
|
(27): DiTLayer( |
|
285.41 M = 3.1% Params, 775.38 GMACs = 3.11% MACs, 9.77 ms = 3.97% latency, 158.72 TFLOPS |
|
(input_layernorm): AdaLayerNormZero( |
|
100.69 M = 1.09% Params, 1.61 GMACs = 0.01% MACs, 991.82 us = 0.4% latency, 3.25 TFLOPS |
|
(silu): SiLU(0 = 0% Params, 0 MACs = 0% MACs, 83.68 us = 0.03% latency, 783.13 MFLOPS) |
|
(linear): Linear(100.69 M = 1.09% Params, 1.61 GMACs = 0.01% MACs, 236.75 us = 0.1% latency, 13.61 TFLOPS, in_features=4096, out_features=24576, bias=True) |
|
(norm): GemmaRMSNorm(4.1 K = 0% Params, 0 MACs = 0% MACs, 463.49 us = 0.19% latency, 0 FLOPS) |
|
) |
|
(self_attn): DiTSelfAttention( |
|
50.33 M = 0.55% Params, 223.34 GMACs = 0.9% MACs, 3.01 ms = 1.22% latency, 148.24 TFLOPS |
|
(q_norm): GemmaRMSNorm(128 = 0% Params, 0 MACs = 0% MACs, 464.2 us = 0.19% latency, 0 FLOPS) |
|
(k_norm): GemmaRMSNorm(128 = 0% Params, 0 MACs = 0% MACs, 238.42 us = 0.1% latency, 0 FLOPS) |
|
(q_proj): Linear(16.78 M = 0.18% Params, 68.72 GMACs = 0.28% MACs, 348.33 us = 0.14% latency, 394.57 TFLOPS, in_features=4096, out_features=4096, bias=False) |
|
(k_proj): Linear(4.19 M = 0.05% Params, 17.18 GMACs = 0.07% MACs, 197.89 us = 0.08% latency, 173.63 TFLOPS, in_features=4096, out_features=1024, bias=False) |
|
(v_proj): Linear(4.19 M = 0.05% Params, 17.18 GMACs = 0.07% MACs, 195.26 us = 0.08% latency, 175.96 TFLOPS, in_features=4096, out_features=1024, bias=False) |
|
(text_k_proj): Linear(4.19 M = 0.05% Params, 17.18 GMACs = 0.07% MACs, 156.64 us = 0.06% latency, 219.35 TFLOPS, in_features=4096, out_features=1024, bias=False) |
|
(text_v_proj): Linear(4.19 M = 0.05% Params, 17.18 GMACs = 0.07% MACs, 154.26 us = 0.06% latency, 222.74 TFLOPS, in_features=4096, out_features=1024, bias=False) |
|
(o_proj): Linear(16.78 M = 0.18% Params, 68.72 GMACs = 0.28% MACs, 336.41 us = 0.14% latency, 408.55 TFLOPS, in_features=4096, out_features=4096, bias=False) |
|
) |
|
(post_attention_layernorm): GemmaRMSNorm(4.1 K = 0% Params, 0 MACs = 0% MACs, 458.72 us = 0.19% latency, 0 FLOPS) |
|
(mlp): GemmaMLP( |
|
134.38 M = 1.46% Params, 550.43 GMACs = 2.21% MACs, 4.59 ms = 1.87% latency, 239.65 TFLOPS |
|
(gate_proj): Linear(44.79 M = 0.49% Params, 183.48 GMACs = 0.74% MACs, 629.66 us = 0.26% latency, 582.77 TFLOPS, in_features=4096, out_features=10936, bias=False) |
|
(up_proj): Linear(44.79 M = 0.49% Params, 183.48 GMACs = 0.74% MACs, 884.77 us = 0.36% latency, 414.74 TFLOPS, in_features=4096, out_features=10936, bias=False) |
|
(down_proj): Linear(44.79 M = 0.49% Params, 183.48 GMACs = 0.74% MACs, 599.62 us = 0.24% latency, 611.97 TFLOPS, in_features=10936, out_features=4096, bias=False) |
|
(act_fn): PytorchGELUTanh(0 = 0% Params, 0 MACs = 0% MACs, 1.97 ms = 0.8% latency, 22.75 GFLOPS) |
|
) |
|
) |
|
(28): DiTLayer( |
|
285.41 M = 3.1% Params, 775.38 GMACs = 3.11% MACs, 7.41 ms = 3.01% latency, 209.21 TFLOPS |
|
(input_layernorm): AdaLayerNormZero( |
|
100.69 M = 1.09% Params, 1.61 GMACs = 0.01% MACs, 982.52 us = 0.4% latency, 3.28 TFLOPS |
|
(silu): SiLU(0 = 0% Params, 0 MACs = 0% MACs, 41.01 us = 0.02% latency, 1.6 GFLOPS) |
|
(linear): Linear(100.69 M = 1.09% Params, 1.61 GMACs = 0.01% MACs, 244.14 us = 0.1% latency, 13.19 TFLOPS, in_features=4096, out_features=24576, bias=True) |
|
(norm): GemmaRMSNorm(4.1 K = 0% Params, 0 MACs = 0% MACs, 469.21 us = 0.19% latency, 0 FLOPS) |
|
) |
|
(self_attn): DiTSelfAttention( |
|
50.33 M = 0.55% Params, 223.34 GMACs = 0.9% MACs, 3.09 ms = 1.25% latency, 144.71 TFLOPS |
|
(q_norm): GemmaRMSNorm(128 = 0% Params, 0 MACs = 0% MACs, 464.92 us = 0.19% latency, 0 FLOPS) |
|
(k_norm): GemmaRMSNorm(128 = 0% Params, 0 MACs = 0% MACs, 260.35 us = 0.11% latency, 0 FLOPS) |
|
(q_proj): Linear(16.78 M = 0.18% Params, 68.72 GMACs = 0.28% MACs, 366.21 us = 0.15% latency, 375.3 TFLOPS, in_features=4096, out_features=4096, bias=False) |
|
(k_proj): Linear(4.19 M = 0.05% Params, 17.18 GMACs = 0.07% MACs, 218.39 us = 0.09% latency, 157.33 TFLOPS, in_features=4096, out_features=1024, bias=False) |
|
(v_proj): Linear(4.19 M = 0.05% Params, 17.18 GMACs = 0.07% MACs, 196.7 us = 0.08% latency, 174.69 TFLOPS, in_features=4096, out_features=1024, bias=False) |
|
(text_k_proj): Linear(4.19 M = 0.05% Params, 17.18 GMACs = 0.07% MACs, 156.4 us = 0.06% latency, 219.69 TFLOPS, in_features=4096, out_features=1024, bias=False) |
|
(text_v_proj): Linear(4.19 M = 0.05% Params, 17.18 GMACs = 0.07% MACs, 153.06 us = 0.06% latency, 224.48 TFLOPS, in_features=4096, out_features=1024, bias=False) |
|
(o_proj): Linear(16.78 M = 0.18% Params, 68.72 GMACs = 0.28% MACs, 339.03 us = 0.14% latency, 405.39 TFLOPS, in_features=4096, out_features=4096, bias=False) |
|
) |
|
(post_attention_layernorm): GemmaRMSNorm(4.1 K = 0% Params, 0 MACs = 0% MACs, 457.29 us = 0.19% latency, 0 FLOPS) |
|
(mlp): GemmaMLP( |
|
134.38 M = 1.46% Params, 550.43 GMACs = 2.21% MACs, 2.2 ms = 0.89% latency, 501.25 TFLOPS |
|
(gate_proj): Linear(44.79 M = 0.49% Params, 183.48 GMACs = 0.74% MACs, 640.87 us = 0.26% latency, 572.58 TFLOPS, in_features=4096, out_features=10936, bias=False) |
|
(up_proj): Linear(44.79 M = 0.49% Params, 183.48 GMACs = 0.74% MACs, 626.8 us = 0.25% latency, 585.43 TFLOPS, in_features=4096, out_features=10936, bias=False) |
|
(down_proj): Linear(44.79 M = 0.49% Params, 183.48 GMACs = 0.74% MACs, 588.66 us = 0.24% latency, 623.37 TFLOPS, in_features=10936, out_features=4096, bias=False) |
|
(act_fn): PytorchGELUTanh(0 = 0% Params, 0 MACs = 0% MACs, 114.92 us = 0.05% latency, 389.79 GFLOPS) |
|
) |
|
) |
|
(29): DiTLayer( |
|
285.41 M = 3.1% Params, 775.38 GMACs = 3.11% MACs, 7.29 ms = 2.96% latency, 212.8 TFLOPS |
|
(input_layernorm): AdaLayerNormZero( |
|
100.69 M = 1.09% Params, 1.61 GMACs = 0.01% MACs, 957.73 us = 0.39% latency, 3.36 TFLOPS |
|
(silu): SiLU(0 = 0% Params, 0 MACs = 0% MACs, 40.05 us = 0.02% latency, 1.64 GFLOPS) |
|
(linear): Linear(100.69 M = 1.09% Params, 1.61 GMACs = 0.01% MACs, 244.14 us = 0.1% latency, 13.19 TFLOPS, in_features=4096, out_features=24576, bias=True) |
|
(norm): GemmaRMSNorm(4.1 K = 0% Params, 0 MACs = 0% MACs, 463.25 us = 0.19% latency, 0 FLOPS) |
|
) |
|
(self_attn): DiTSelfAttention( |
|
50.33 M = 0.55% Params, 223.34 GMACs = 0.9% MACs, 3.05 ms = 1.24% latency, 146.32 TFLOPS |
|
(q_norm): GemmaRMSNorm(128 = 0% Params, 0 MACs = 0% MACs, 463.49 us = 0.19% latency, 0 FLOPS) |
|
(k_norm): GemmaRMSNorm(128 = 0% Params, 0 MACs = 0% MACs, 236.75 us = 0.1% latency, 0 FLOPS) |
|
(q_proj): Linear(16.78 M = 0.18% Params, 68.72 GMACs = 0.28% MACs, 351.19 us = 0.14% latency, 391.35 TFLOPS, in_features=4096, out_features=4096, bias=False) |
|
(k_proj): Linear(4.19 M = 0.05% Params, 17.18 GMACs = 0.07% MACs, 199.32 us = 0.08% latency, 172.39 TFLOPS, in_features=4096, out_features=1024, bias=False) |
|
(v_proj): Linear(4.19 M = 0.05% Params, 17.18 GMACs = 0.07% MACs, 194.55 us = 0.08% latency, 176.61 TFLOPS, in_features=4096, out_features=1024, bias=False) |
|
(text_k_proj): Linear(4.19 M = 0.05% Params, 17.18 GMACs = 0.07% MACs, 156.16 us = 0.06% latency, 220.02 TFLOPS, in_features=4096, out_features=1024, bias=False) |
|
(text_v_proj): Linear(4.19 M = 0.05% Params, 17.18 GMACs = 0.07% MACs, 175.95 us = 0.07% latency, 195.28 TFLOPS, in_features=4096, out_features=1024, bias=False) |
|
(o_proj): Linear(16.78 M = 0.18% Params, 68.72 GMACs = 0.28% MACs, 355.96 us = 0.14% latency, 386.11 TFLOPS, in_features=4096, out_features=4096, bias=False) |
|
) |
|
(post_attention_layernorm): GemmaRMSNorm(4.1 K = 0% Params, 0 MACs = 0% MACs, 457.76 us = 0.19% latency, 0 FLOPS) |
|
(mlp): GemmaMLP( |
|
134.38 M = 1.46% Params, 550.43 GMACs = 2.21% MACs, 2.16 ms = 0.88% latency, 509.32 TFLOPS |
|
(gate_proj): Linear(44.79 M = 0.49% Params, 183.48 GMACs = 0.74% MACs, 643.97 us = 0.26% latency, 569.83 TFLOPS, in_features=4096, out_features=10936, bias=False) |
|
(up_proj): Linear(44.79 M = 0.49% Params, 183.48 GMACs = 0.74% MACs, 625.37 us = 0.25% latency, 586.77 TFLOPS, in_features=4096, out_features=10936, bias=False) |
|
(down_proj): Linear(44.79 M = 0.49% Params, 183.48 GMACs = 0.74% MACs, 589.37 us = 0.24% latency, 622.62 TFLOPS, in_features=10936, out_features=4096, bias=False) |
|
(act_fn): PytorchGELUTanh(0 = 0% Params, 0 MACs = 0% MACs, 100.85 us = 0.04% latency, 444.16 GFLOPS) |
|
) |
|
) |
|
(30): DiTLayer( |
|
285.41 M = 3.1% Params, 775.38 GMACs = 3.11% MACs, 7.26 ms = 2.95% latency, 213.5 TFLOPS |
|
(input_layernorm): AdaLayerNormZero( |
|
100.69 M = 1.09% Params, 1.61 GMACs = 0.01% MACs, 937.22 us = 0.38% latency, 3.44 TFLOPS |
|
(silu): SiLU(0 = 0% Params, 0 MACs = 0% MACs, 39.58 us = 0.02% latency, 1.66 GFLOPS) |
|
(linear): Linear(100.69 M = 1.09% Params, 1.61 GMACs = 0.01% MACs, 231.74 us = 0.09% latency, 13.9 TFLOPS, in_features=4096, out_features=24576, bias=True) |
|
(norm): GemmaRMSNorm(4.1 K = 0% Params, 0 MACs = 0% MACs, 459.91 us = 0.19% latency, 0 FLOPS) |
|
) |
|
(self_attn): DiTSelfAttention( |
|
50.33 M = 0.55% Params, 223.34 GMACs = 0.9% MACs, 3.05 ms = 1.24% latency, 146.45 TFLOPS |
|
(q_norm): GemmaRMSNorm(128 = 0% Params, 0 MACs = 0% MACs, 464.68 us = 0.19% latency, 0 FLOPS) |
|
(k_norm): GemmaRMSNorm(128 = 0% Params, 0 MACs = 0% MACs, 238.18 us = 0.1% latency, 0 FLOPS) |
|
(q_proj): Linear(16.78 M = 0.18% Params, 68.72 GMACs = 0.28% MACs, 350 us = 0.14% latency, 392.68 TFLOPS, in_features=4096, out_features=4096, bias=False) |
|
(k_proj): Linear(4.19 M = 0.05% Params, 17.18 GMACs = 0.07% MACs, 195.98 us = 0.08% latency, 175.32 TFLOPS, in_features=4096, out_features=1024, bias=False) |
|
(v_proj): Linear(4.19 M = 0.05% Params, 17.18 GMACs = 0.07% MACs, 214.82 us = 0.09% latency, 159.95 TFLOPS, in_features=4096, out_features=1024, bias=False) |
|
(text_k_proj): Linear(4.19 M = 0.05% Params, 17.18 GMACs = 0.07% MACs, 157.83 us = 0.06% latency, 217.7 TFLOPS, in_features=4096, out_features=1024, bias=False) |
|
(text_v_proj): Linear(4.19 M = 0.05% Params, 17.18 GMACs = 0.07% MACs, 168.56 us = 0.07% latency, 203.84 TFLOPS, in_features=4096, out_features=1024, bias=False) |
|
(o_proj): Linear(16.78 M = 0.18% Params, 68.72 GMACs = 0.28% MACs, 338.55 us = 0.14% latency, 405.96 TFLOPS, in_features=4096, out_features=4096, bias=False) |
|
) |
|
(post_attention_layernorm): GemmaRMSNorm(4.1 K = 0% Params, 0 MACs = 0% MACs, 458.24 us = 0.19% latency, 0 FLOPS) |
|
(mlp): GemmaMLP( |
|
134.38 M = 1.46% Params, 550.43 GMACs = 2.21% MACs, 2.17 ms = 0.88% latency, 508.14 TFLOPS |
|
(gate_proj): Linear(44.79 M = 0.49% Params, 183.48 GMACs = 0.74% MACs, 649.45 us = 0.26% latency, 565.02 TFLOPS, in_features=4096, out_features=10936, bias=False) |
|
(up_proj): Linear(44.79 M = 0.49% Params, 183.48 GMACs = 0.74% MACs, 626.56 us = 0.25% latency, 585.66 TFLOPS, in_features=4096, out_features=10936, bias=False) |
|
(down_proj): Linear(44.79 M = 0.49% Params, 183.48 GMACs = 0.74% MACs, 587.7 us = 0.24% latency, 624.38 TFLOPS, in_features=10936, out_features=4096, bias=False) |
|
(act_fn): PytorchGELUTanh(0 = 0% Params, 0 MACs = 0% MACs, 101.57 us = 0.04% latency, 441.03 GFLOPS) |
|
) |
|
) |
|
(31): DiTLayer( |
|
285.41 M = 3.1% Params, 775.38 GMACs = 3.11% MACs, 7.25 ms = 2.95% latency, 213.94 TFLOPS |
|
(input_layernorm): AdaLayerNormZero( |
|
100.69 M = 1.09% Params, 1.61 GMACs = 0.01% MACs, 940.56 us = 0.38% latency, 3.42 TFLOPS |
|
(silu): SiLU(0 = 0% Params, 0 MACs = 0% MACs, 37.19 us = 0.02% latency, 1.76 GFLOPS) |
|
(linear): Linear(100.69 M = 1.09% Params, 1.61 GMACs = 0.01% MACs, 235.08 us = 0.1% latency, 13.7 TFLOPS, in_features=4096, out_features=24576, bias=True) |
|
(norm): GemmaRMSNorm(4.1 K = 0% Params, 0 MACs = 0% MACs, 462.29 us = 0.19% latency, 0 FLOPS) |
|
) |
|
(self_attn): DiTSelfAttention( |
|
50.33 M = 0.55% Params, 223.34 GMACs = 0.9% MACs, 3.03 ms = 1.23% latency, 147.55 TFLOPS |
|
(q_norm): GemmaRMSNorm(128 = 0% Params, 0 MACs = 0% MACs, 466.11 us = 0.19% latency, 0 FLOPS) |
|
(k_norm): GemmaRMSNorm(128 = 0% Params, 0 MACs = 0% MACs, 235.32 us = 0.1% latency, 0 FLOPS) |
|
(q_proj): Linear(16.78 M = 0.18% Params, 68.72 GMACs = 0.28% MACs, 347.38 us = 0.14% latency, 395.65 TFLOPS, in_features=4096, out_features=4096, bias=False) |
|
(k_proj): Linear(4.19 M = 0.05% Params, 17.18 GMACs = 0.07% MACs, 216.01 us = 0.09% latency, 159.07 TFLOPS, in_features=4096, out_features=1024, bias=False) |
|
(v_proj): Linear(4.19 M = 0.05% Params, 17.18 GMACs = 0.07% MACs, 194.31 us = 0.08% latency, 176.83 TFLOPS, in_features=4096, out_features=1024, bias=False) |
|
(text_k_proj): Linear(4.19 M = 0.05% Params, 17.18 GMACs = 0.07% MACs, 156.16 us = 0.06% latency, 220.02 TFLOPS, in_features=4096, out_features=1024, bias=False) |
|
(text_v_proj): Linear(4.19 M = 0.05% Params, 17.18 GMACs = 0.07% MACs, 154.5 us = 0.06% latency, 222.4 TFLOPS, in_features=4096, out_features=1024, bias=False) |
|
(o_proj): Linear(16.78 M = 0.18% Params, 68.72 GMACs = 0.28% MACs, 340.46 us = 0.14% latency, 403.68 TFLOPS, in_features=4096, out_features=4096, bias=False) |
|
) |
|
(post_attention_layernorm): GemmaRMSNorm(4.1 K = 0% Params, 0 MACs = 0% MACs, 456.81 us = 0.19% latency, 0 FLOPS) |
|
(mlp): GemmaMLP( |
|
134.38 M = 1.46% Params, 550.43 GMACs = 2.21% MACs, 2.17 ms = 0.88% latency, 508.42 TFLOPS |
|
(gate_proj): Linear(44.79 M = 0.49% Params, 183.48 GMACs = 0.74% MACs, 641.82 us = 0.26% latency, 571.73 TFLOPS, in_features=4096, out_features=10936, bias=False) |
|
(up_proj): Linear(44.79 M = 0.49% Params, 183.48 GMACs = 0.74% MACs, 624.9 us = 0.25% latency, 587.22 TFLOPS, in_features=4096, out_features=10936, bias=False) |
|
(down_proj): Linear(44.79 M = 0.49% Params, 183.48 GMACs = 0.74% MACs, 581.98 us = 0.24% latency, 630.52 TFLOPS, in_features=10936, out_features=4096, bias=False) |
|
(act_fn): PytorchGELUTanh(0 = 0% Params, 0 MACs = 0% MACs, 100.85 us = 0.04% latency, 444.16 GFLOPS) |
|
) |
|
) |
|
) |
|
(patch_embed): PatchEmbed( |
|
266.24 K = 0% Params, 1.07 GMACs = 0% MACs, 627.04 us = 0.25% latency, 3.45 TFLOPS |
|
(proj): Conv2d(266.24 K = 0% Params, 1.07 GMACs = 0% MACs, 391.01 us = 0.16% latency, 5.54 TFLOPS, 16, 4096, kernel_size=(2, 2), stride=(2, 2)) |
|
) |
|
(rotary_emb): GemmaRotaryEmbedding(0 = 0% Params, 0 MACs = 0% MACs, 0 s = 0% latency, 0 FLOPS) |
|
(time_proj): Timesteps(0 = 0% Params, 0 MACs = 0% MACs, 261.78 us = 0.11% latency, 0 FLOPS) |
|
(timestep_embedder): Sequential( |
|
17.83 M = 0.19% Params, 285.21 MMACs = 0% MACs, 520.94 us = 0.21% latency, 1.1 TFLOPS |
|
(0): Linear(1.05 M = 0.01% Params, 16.78 MMACs = 0% MACs, 221.73 us = 0.09% latency, 151.33 GFLOPS, in_features=256, out_features=4096, bias=True) |
|
(1): SiLU(0 = 0% Params, 0 MACs = 0% MACs, 41.96 us = 0.02% latency, 1.56 GFLOPS) |
|
(2): Linear(16.78 M = 0.18% Params, 268.44 MMACs = 0% MACs, 184.54 us = 0.07% latency, 2.91 TFLOPS, in_features=4096, out_features=4096, bias=True) |
|
) |
|
(context_embedder): Sequential( |
|
8.39 M = 0.09% Params, 34.36 GMACs = 0.14% MACs, 499.01 us = 0.2% latency, 137.71 TFLOPS |
|
(0): GemmaRMSNorm(2.05 K = 0% Params, 0 MACs = 0% MACs, 178.81 us = 0.07% latency, 0 FLOPS) |
|
(1): Linear(8.39 M = 0.09% Params, 34.36 GMACs = 0.14% MACs, 267.27 us = 0.11% latency, 257.12 TFLOPS, in_features=2048, out_features=4096, bias=True) |
|
) |
|
(norm_out): AdaLayerNormOut( |
|
33.57 M = 0.36% Params, 536.87 MMACs = 0% MACs, 921.01 us = 0.37% latency, 1.17 TFLOPS |
|
(silu): SiLU(0 = 0% Params, 0 MACs = 0% MACs, 51.5 us = 0.02% latency, 1.27 GFLOPS) |
|
(linear): Linear(33.56 M = 0.36% Params, 536.87 MMACs = 0% MACs, 197.89 us = 0.08% latency, 5.43 TFLOPS, in_features=4096, out_features=8192, bias=True) |
|
(norm): GemmaRMSNorm(4.1 K = 0% Params, 0 MACs = 0% MACs, 461.1 us = 0.19% latency, 0 FLOPS) |
|
) |
|
(proj_out): Linear(262.21 K = 0% Params, 1.07 GMACs = 0% MACs, 205.99 us = 0.08% latency, 10.42 TFLOPS, in_features=4096, out_features=64, bias=True) |
|
(repa_projector): Sequential( |
|
14.16 M = 0.15% Params, 57.98 GMACs = 0.23% MACs, 774.15 us = 0.31% latency, 149.82 TFLOPS |
|
(0): Linear(8.39 M = 0.09% Params, 34.36 GMACs = 0.14% MACs, 276.33 us = 0.11% latency, 248.69 TFLOPS, in_features=4096, out_features=2048, bias=True) |
|
(1): SiLU(0 = 0% Params, 0 MACs = 0% MACs, 33.62 us = 0.01% latency, 249.53 GFLOPS) |
|
(2): Linear(4.2 M = 0.05% Params, 17.18 GMACs = 0.07% MACs, 185.73 us = 0.08% latency, 185 TFLOPS, in_features=2048, out_features=2048, bias=True) |
|
(3): SiLU(0 = 0% Params, 0 MACs = 0% MACs, 30.04 us = 0.01% latency, 279.24 GFLOPS) |
|
(4): Linear(1.57 M = 0.02% Params, 6.44 GMACs = 0.03% MACs, 144.48 us = 0.06% latency, 89.18 TFLOPS, in_features=2048, out_features=768, bias=True) |
|
) |
|
) |
|
------------------------------------------------------------------------------ |
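The relations used throughout the per-module dump above can be checked directly from the printed values. Below is a minimal sanity-check sketch in Python, using the o_proj entry of one DiTLayer as the worked example (values copied from the dump; rounding in the printout is ignored): a bias-free Linear has in_features * out_features parameters, flops are taken as 2 * MACs, and the per-module FLOPS figure is flops divided by the measured forward latency.

# Minimal sanity check of the per-module numbers printed above.
# Values are copied from an o_proj row of the dump; the relations
# (flops = 2 * MACs, FLOPS = flops / latency, params of a bias-free
# Linear = in_features * out_features) match the profiler's notation.

in_features, out_features = 4096, 4096
params = in_features * out_features          # 16_777_216 ~= "16.78 M"

macs = 68.72e9                               # "68.72 GMACs" as printed
flops = 2 * macs                             # one MAC = one multiply + one add
latency_s = 334.02e-6                        # "334.02 us" as printed

tflops = flops / latency_s / 1e12
print(f"params ~= {params/1e6:.2f} M, achieved ~= {tflops:.1f} TFLOPS")
# -> params ~= 16.78 M, achieved ~= 411.5 TFLOPS (printout: 411.46 TFLOPS)

The same check applies to any Linear row above. Rows printed with 0 MACs (the GemmaRMSNorm, Timesteps and GemmaRotaryEmbedding entries) still report a nonzero latency, which is why their FLOPS column reads 0.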
|
|
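For reference, a dump in this format is what DeepSpeed's built-in flops profiler emits when enabled through the training config. A minimal sketch of the relevant config fragment is shown below in Python dict form; the option names are the standard flops_profiler keys, while the specific values are illustrative and are not taken from the run that produced this log.

# Sketch of enabling the DeepSpeed flops profiler via the training config.
# Only the flops_profiler block is shown; values here are illustrative.

ds_config = {
    # ... existing training config (optimizer, precision, ZeRO, etc.) ...
    "flops_profiler": {
        "enabled": True,
        "profile_step": 2,     # which training step to profile
        "module_depth": -1,    # -1 = profile every nesting depth
        "top_modules": 1,      # modules listed per depth in the aggregated view
        "detailed": True,      # print the per-module tree
        "output_file": None,   # None = write to stdout
    },
}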