|
|
|
-------------------------- DeepSpeed Flops Profiler -------------------------- |
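
This report is produced by the DeepSpeed flops profiler. As an illustration only (the actual configuration used for this run is not shown here), such a report is typically enabled through a flops_profiler block in the DeepSpeed config; the minimal sketch below uses values inferred from the report itself:

    # Hypothetical ds_config fragment; values mirror this report, not a known config file.
    ds_config = {
        "train_micro_batch_size_per_gpu": 16,  # "batch size per GPU: 16"
        "flops_profiler": {
            "enabled": True,
            "profile_step": 2,    # "Profile Summary at step 2"
            "module_depth": -1,   # profile every module depth
            "top_modules": 1,     # "Top 1 modules ... at different model depths"
            "detailed": True,     # print the per-module tree in the detailed profile
            "output_file": None,
        },
    }
    # engine, _, _, _ = deepspeed.initialize(model=model, config=ds_config, ...)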
|
Profile Summary at step 2: |
|
Notations: |
|
data parallel size (dp_size), model parallel size (mp_size), |
|
number of parameters (params), number of multiply-accumulate operations (MACs), |
|
number of floating-point operations (flops), floating-point operations per second (FLOPS), |
|
fwd latency (forward propagation latency), bwd latency (backward propagation latency), |
|
step (weights update latency), iter latency (sum of fwd, bwd and step latency) |
|
|
|
world size: 32 |
|
data parallel size: 32 |
|
model parallel size: 1 |
|
batch size per GPU: 16 |
|
params per GPU: 9.21 B |
|
params of model = params per GPU * mp_size: 9.21 B |
|
fwd MACs per GPU: 24.91 TMACs |
|
fwd flops per GPU: 49.82 T |
|
fwd flops of model = fwd flops per GPU * mp_size: 49.82 T |
|
fwd latency: 246.3 ms |
|
fwd FLOPS per GPU = fwd flops per GPU / fwd latency: 202.25 TFLOPS |
|
bwd latency: 973.32 ms |
|
bwd FLOPS per GPU = 2 * fwd flops per GPU / bwd latency: 102.36 TFLOPS |
|
fwd+bwd FLOPS per GPU = 3 * fwd flops per GPU / (fwd+bwd latency): 122.54 TFLOPS |
|
step latency: 441.01 ms |
|
iter latency: 1.66 s |
|
FLOPS per GPU = 3 * fwd flops per GPU / iter latency: 89.99 TFLOPS |
|
samples/second: 308.32 |
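
The derived throughput lines above follow from the raw numbers in this summary. The small Python sketch below (using only values printed above; results differ slightly from the report because the printed latencies are rounded) recomputes them; the factor 3 reflects the profiler's assumption that the backward pass costs roughly twice the forward pass in flops:

    # Recompute the summary's derived metrics from its printed inputs.
    fwd_flops_per_gpu = 49.82e12   # "fwd flops per GPU"
    fwd_latency = 246.3e-3         # s
    bwd_latency = 973.32e-3        # s
    iter_latency = 1.66            # s
    batch_per_gpu, world_size = 16, 32

    print(fwd_flops_per_gpu / fwd_latency / 1e12)                       # ~202.3 TFLOPS (fwd)
    print(2 * fwd_flops_per_gpu / bwd_latency / 1e12)                   # ~102.4 TFLOPS (bwd)
    print(3 * fwd_flops_per_gpu / (fwd_latency + bwd_latency) / 1e12)   # ~122.5 TFLOPS (fwd+bwd)
    print(3 * fwd_flops_per_gpu / iter_latency / 1e12)                  # ~90.0 TFLOPS (full iteration)
    print(batch_per_gpu * world_size / iter_latency)                    # ~308.4 samples/second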
|
|
|
----------------------------- Aggregated Profile per GPU ----------------------------- |
|
Top 1 modules in terms of params, MACs or fwd latency at different model depths: |
|
depth 0: |
|
params - {'DiT': '9.21 B'} |
|
MACs - {'DiT': '24.91 TMACs'} |
|
fwd latency - {'DiT': '246.07 ms'} |
|
depth 1: |
|
params - {'ModuleList': '9.13 B'} |
|
MACs - {'ModuleList': '24.81 TMACs'} |
|
fwd latency - {'ModuleList': '234.7 ms'} |
|
depth 2: |
|
params - {'DiTLayer': '9.13 B'} |
|
MACs - {'DiTLayer': '24.81 TMACs'} |
|
fwd latency - {'DiTLayer': '234.7 ms'} |
|
depth 3: |
|
params - {'GemmaMLP': '4.3 B'} |
|
MACs - {'GemmaMLP': '17.61 TMACs'} |
|
fwd latency - {'DiTSelfAttention': '96.91 ms'} |
|
|
|
------------------------------ Detailed Profile per GPU ------------------------------ |
|
Each module profile is listed after its name in the following order: |
|
params, percentage of total params, MACs, percentage of total MACs, fwd latency, percentage of total fwd latency, fwd FLOPS |
|
|
|
Note: 1. A module can use torch.nn.Module or torch.nn.functional operations to compute logits (e.g. CrossEntropyLoss). These are not counted as submodules and thus are not printed out; however, they account for the difference between a parent's MACs (or latency) and the sum of its submodules'. |
|
2. The number of floating-point operations is a theoretical estimate, so the FLOPS computed from it can exceed the maximum achievable system throughput. |
|
3. The fwd latency listed in the top module's profile is captured directly at the module's forward function in PyTorch, so it is less than the fwd latency shown above, which is measured by DeepSpeed. |
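
A per-module breakdown like the tree below can also be produced outside of a training run with DeepSpeed's standalone profiler. A minimal sketch, assuming only that a torch.nn.Module and a representative input are available (the tiny model and input shape here are placeholders, not details of the DiT run profiled below):

    import torch
    from deepspeed.profiling.flops_profiler import get_model_profile

    # Placeholder model and input; substitute the real module and its forward() inputs.
    model = torch.nn.Sequential(torch.nn.Linear(4096, 4096), torch.nn.GELU())
    example_input = torch.randn(16, 4096)

    with torch.no_grad():
        flops, macs, params = get_model_profile(
            model=model,
            args=[example_input],  # positional arguments passed to model.forward
            print_profile=True,    # print a summary plus a per-module tree like the one below
            detailed=True,         # include the per-module breakdown
            module_depth=-1,       # profile all depths
            top_modules=1,         # report the top 1 module at each depth
            warm_up=1,
            as_string=True,        # return counts formatted as human-readable strings
        )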
|
|
|
DiT( |
|
9.21 B = 100% Params, 24.91 TMACs = 100% MACs, 246.07 ms = 100% latency, 202.45 TFLOPS |
|
(layers): ModuleList( |
|
(0): DiTLayer( |
|
285.41 M = 3.1% Params, 775.38 GMACs = 3.11% MACs, 7.34 ms = 2.98% latency, 211.41 TFLOPS |
|
(input_layernorm): AdaLayerNormZero( |
|
100.69 M = 1.09% Params, 1.61 GMACs = 0.01% MACs, 991.11 us = 0.4% latency, 3.25 TFLOPS |
|
(silu): SiLU(0 = 0% Params, 0 MACs = 0% MACs, 39.82 us = 0.02% latency, 1.65 GFLOPS) |
|
(linear): Linear(100.69 M = 1.09% Params, 1.61 GMACs = 0.01% MACs, 270.13 us = 0.11% latency, 11.92 TFLOPS, in_features=4096, out_features=24576, bias=True) |
|
(norm): GemmaRMSNorm(4.1 K = 0% Params, 0 MACs = 0% MACs, 462.53 us = 0.19% latency, 0 FLOPS) |
|
) |
|
(self_attn): DiTSelfAttention( |
|
50.33 M = 0.55% Params, 223.34 GMACs = 0.9% MACs, 3.05 ms = 1.24% latency, 146.31 TFLOPS |
|
(q_norm): GemmaRMSNorm(128 = 0% Params, 0 MACs = 0% MACs, 463.96 us = 0.19% latency, 0 FLOPS) |
|
(k_norm): GemmaRMSNorm(128 = 0% Params, 0 MACs = 0% MACs, 236.27 us = 0.1% latency, 0 FLOPS) |
|
(q_proj): Linear(16.78 M = 0.18% Params, 68.72 GMACs = 0.28% MACs, 361.44 us = 0.15% latency, 380.25 TFLOPS, in_features=4096, out_features=4096, bias=False) |
|
(k_proj): Linear(4.19 M = 0.05% Params, 17.18 GMACs = 0.07% MACs, 199.32 us = 0.08% latency, 172.39 TFLOPS, in_features=4096, out_features=1024, bias=False) |
|
(v_proj): Linear(4.19 M = 0.05% Params, 17.18 GMACs = 0.07% MACs, 195.5 us = 0.08% latency, 175.75 TFLOPS, in_features=4096, out_features=1024, bias=False) |
|
(text_k_proj): Linear(4.19 M = 0.05% Params, 17.18 GMACs = 0.07% MACs, 157.59 us = 0.06% latency, 218.03 TFLOPS, in_features=4096, out_features=1024, bias=False) |
|
(text_v_proj): Linear(4.19 M = 0.05% Params, 17.18 GMACs = 0.07% MACs, 153.78 us = 0.06% latency, 223.43 TFLOPS, in_features=4096, out_features=1024, bias=False) |
|
(o_proj): Linear(16.78 M = 0.18% Params, 68.72 GMACs = 0.28% MACs, 354.53 us = 0.14% latency, 387.67 TFLOPS, in_features=4096, out_features=4096, bias=False) |
|
) |
|
(post_attention_layernorm): GemmaRMSNorm(4.1 K = 0% Params, 0 MACs = 0% MACs, 457.76 us = 0.19% latency, 0 FLOPS) |
|
(mlp): GemmaMLP( |
|
134.38 M = 1.46% Params, 550.43 GMACs = 2.21% MACs, 2.18 ms = 0.88% latency, 505.97 TFLOPS |
|
(gate_proj): Linear(44.79 M = 0.49% Params, 183.48 GMACs = 0.74% MACs, 628.23 us = 0.26% latency, 584.1 TFLOPS, in_features=4096, out_features=10936, bias=False) |
|
(up_proj): Linear(44.79 M = 0.49% Params, 183.48 GMACs = 0.74% MACs, 624.9 us = 0.25% latency, 587.22 TFLOPS, in_features=4096, out_features=10936, bias=False) |
|
(down_proj): Linear(44.79 M = 0.49% Params, 183.48 GMACs = 0.74% MACs, 595.33 us = 0.24% latency, 616.38 TFLOPS, in_features=10936, out_features=4096, bias=False) |
|
(act_fn): PytorchGELUTanh(0 = 0% Params, 0 MACs = 0% MACs, 116.11 us = 0.05% latency, 385.79 GFLOPS) |
|
) |
|
) |
|
(1): DiTLayer( |
|
285.41 M = 3.1% Params, 775.38 GMACs = 3.11% MACs, 7.2 ms = 2.93% latency, 215.36 TFLOPS |
|
(input_layernorm): AdaLayerNormZero( |
|
100.69 M = 1.09% Params, 1.61 GMACs = 0.01% MACs, 948.19 us = 0.39% latency, 3.4 TFLOPS |
|
(silu): SiLU(0 = 0% Params, 0 MACs = 0% MACs, 36.72 us = 0.01% latency, 1.78 GFLOPS) |
|
(linear): Linear(100.69 M = 1.09% Params, 1.61 GMACs = 0.01% MACs, 238.9 us = 0.1% latency, 13.48 TFLOPS, in_features=4096, out_features=24576, bias=True) |
|
(norm): GemmaRMSNorm(4.1 K = 0% Params, 0 MACs = 0% MACs, 461.1 us = 0.19% latency, 0 FLOPS) |
|
) |
|
(self_attn): DiTSelfAttention( |
|
50.33 M = 0.55% Params, 223.34 GMACs = 0.9% MACs, 3 ms = 1.22% latency, 148.78 TFLOPS |
|
(q_norm): GemmaRMSNorm(128 = 0% Params, 0 MACs = 0% MACs, 462.77 us = 0.19% latency, 0 FLOPS) |
|
(k_norm): GemmaRMSNorm(128 = 0% Params, 0 MACs = 0% MACs, 235.32 us = 0.1% latency, 0 FLOPS) |
|
(q_proj): Linear(16.78 M = 0.18% Params, 68.72 GMACs = 0.28% MACs, 348.09 us = 0.14% latency, 394.84 TFLOPS, in_features=4096, out_features=4096, bias=False) |
|
(k_proj): Linear(4.19 M = 0.05% Params, 17.18 GMACs = 0.07% MACs, 197.17 us = 0.08% latency, 174.26 TFLOPS, in_features=4096, out_features=1024, bias=False) |
|
(v_proj): Linear(4.19 M = 0.05% Params, 17.18 GMACs = 0.07% MACs, 194.55 us = 0.08% latency, 176.61 TFLOPS, in_features=4096, out_features=1024, bias=False) |
|
(text_k_proj): Linear(4.19 M = 0.05% Params, 17.18 GMACs = 0.07% MACs, 157.83 us = 0.06% latency, 217.7 TFLOPS, in_features=4096, out_features=1024, bias=False) |
|
(text_v_proj): Linear(4.19 M = 0.05% Params, 17.18 GMACs = 0.07% MACs, 154.5 us = 0.06% latency, 222.4 TFLOPS, in_features=4096, out_features=1024, bias=False) |
|
(o_proj): Linear(16.78 M = 0.18% Params, 68.72 GMACs = 0.28% MACs, 343.8 us = 0.14% latency, 399.76 TFLOPS, in_features=4096, out_features=4096, bias=False) |
|
) |
|
(post_attention_layernorm): GemmaRMSNorm(4.1 K = 0% Params, 0 MACs = 0% MACs, 457.05 us = 0.19% latency, 0 FLOPS) |
|
(mlp): GemmaMLP( |
|
134.38 M = 1.46% Params, 550.43 GMACs = 2.21% MACs, 2.14 ms = 0.87% latency, 514.26 TFLOPS |
|
(gate_proj): Linear(44.79 M = 0.49% Params, 183.48 GMACs = 0.74% MACs, 627.04 us = 0.25% latency, 585.21 TFLOPS, in_features=4096, out_features=10936, bias=False) |
|
(up_proj): Linear(44.79 M = 0.49% Params, 183.48 GMACs = 0.74% MACs, 621.8 us = 0.25% latency, 590.15 TFLOPS, in_features=4096, out_features=10936, bias=False) |
|
(down_proj): Linear(44.79 M = 0.49% Params, 183.48 GMACs = 0.74% MACs, 587.7 us = 0.24% latency, 624.38 TFLOPS, in_features=10936, out_features=4096, bias=False) |
|
(act_fn): PytorchGELUTanh(0 = 0% Params, 0 MACs = 0% MACs, 100.85 us = 0.04% latency, 444.16 GFLOPS) |
|
) |
|
) |
|
(2): DiTLayer( |
|
285.41 M = 3.1% Params, 775.38 GMACs = 3.11% MACs, 7.22 ms = 2.93% latency, 214.76 TFLOPS |
|
(input_layernorm): AdaLayerNormZero( |
|
100.69 M = 1.09% Params, 1.61 GMACs = 0.01% MACs, 947.48 us = 0.39% latency, 3.4 TFLOPS |
|
(silu): SiLU(0 = 0% Params, 0 MACs = 0% MACs, 37.43 us = 0.02% latency, 1.75 GFLOPS) |
|
(linear): Linear(100.69 M = 1.09% Params, 1.61 GMACs = 0.01% MACs, 238.66 us = 0.1% latency, 13.5 TFLOPS, in_features=4096, out_features=24576, bias=True) |
|
(norm): GemmaRMSNorm(4.1 K = 0% Params, 0 MACs = 0% MACs, 462.06 us = 0.19% latency, 0 FLOPS) |
|
) |
|
(self_attn): DiTSelfAttention( |
|
50.33 M = 0.55% Params, 223.34 GMACs = 0.9% MACs, 3.02 ms = 1.23% latency, 148.02 TFLOPS |
|
(q_norm): GemmaRMSNorm(128 = 0% Params, 0 MACs = 0% MACs, 466.11 us = 0.19% latency, 0 FLOPS) |
|
(k_norm): GemmaRMSNorm(128 = 0% Params, 0 MACs = 0% MACs, 234.6 us = 0.1% latency, 0 FLOPS) |
|
(q_proj): Linear(16.78 M = 0.18% Params, 68.72 GMACs = 0.28% MACs, 349.04 us = 0.14% latency, 393.76 TFLOPS, in_features=4096, out_features=4096, bias=False) |
|
(k_proj): Linear(4.19 M = 0.05% Params, 17.18 GMACs = 0.07% MACs, 198.13 us = 0.08% latency, 173.42 TFLOPS, in_features=4096, out_features=1024, bias=False) |
|
(v_proj): Linear(4.19 M = 0.05% Params, 17.18 GMACs = 0.07% MACs, 194.79 us = 0.08% latency, 176.4 TFLOPS, in_features=4096, out_features=1024, bias=False) |
|
(text_k_proj): Linear(4.19 M = 0.05% Params, 17.18 GMACs = 0.07% MACs, 159.74 us = 0.06% latency, 215.1 TFLOPS, in_features=4096, out_features=1024, bias=False) |
|
(text_v_proj): Linear(4.19 M = 0.05% Params, 17.18 GMACs = 0.07% MACs, 154.5 us = 0.06% latency, 222.4 TFLOPS, in_features=4096, out_features=1024, bias=False) |
|
(o_proj): Linear(16.78 M = 0.18% Params, 68.72 GMACs = 0.28% MACs, 339.98 us = 0.14% latency, 404.25 TFLOPS, in_features=4096, out_features=4096, bias=False) |
|
) |
|
(post_attention_layernorm): GemmaRMSNorm(4.1 K = 0% Params, 0 MACs = 0% MACs, 458.24 us = 0.19% latency, 0 FLOPS) |
|
(mlp): GemmaMLP( |
|
134.38 M = 1.46% Params, 550.43 GMACs = 2.21% MACs, 2.15 ms = 0.87% latency, 513.11 TFLOPS |
|
(gate_proj): Linear(44.79 M = 0.49% Params, 183.48 GMACs = 0.74% MACs, 633.96 us = 0.26% latency, 578.83 TFLOPS, in_features=4096, out_features=10936, bias=False) |
|
(up_proj): Linear(44.79 M = 0.49% Params, 183.48 GMACs = 0.74% MACs, 620.84 us = 0.25% latency, 591.05 TFLOPS, in_features=4096, out_features=10936, bias=False) |
|
(down_proj): Linear(44.79 M = 0.49% Params, 183.48 GMACs = 0.74% MACs, 587.46 us = 0.24% latency, 624.64 TFLOPS, in_features=10936, out_features=4096, bias=False) |
|
(act_fn): PytorchGELUTanh(0 = 0% Params, 0 MACs = 0% MACs, 100.61 us = 0.04% latency, 445.21 GFLOPS) |
|
) |
|
) |
|
(3): DiTLayer( |
|
285.41 M = 3.1% Params, 775.38 GMACs = 3.11% MACs, 7.25 ms = 2.94% latency, 214.03 TFLOPS |
|
(input_layernorm): AdaLayerNormZero( |
|
100.69 M = 1.09% Params, 1.61 GMACs = 0.01% MACs, 948.19 us = 0.39% latency, 3.4 TFLOPS |
|
(silu): SiLU(0 = 0% Params, 0 MACs = 0% MACs, 38.15 us = 0.02% latency, 1.72 GFLOPS) |
|
(linear): Linear(100.69 M = 1.09% Params, 1.61 GMACs = 0.01% MACs, 239.13 us = 0.1% latency, 13.47 TFLOPS, in_features=4096, out_features=24576, bias=True) |
|
(norm): GemmaRMSNorm(4.1 K = 0% Params, 0 MACs = 0% MACs, 462.06 us = 0.19% latency, 0 FLOPS) |
|
) |
|
(self_attn): DiTSelfAttention( |
|
50.33 M = 0.55% Params, 223.34 GMACs = 0.9% MACs, 3.04 ms = 1.24% latency, 146.93 TFLOPS |
|
(q_norm): GemmaRMSNorm(128 = 0% Params, 0 MACs = 0% MACs, 464.68 us = 0.19% latency, 0 FLOPS) |
|
(k_norm): GemmaRMSNorm(128 = 0% Params, 0 MACs = 0% MACs, 238.9 us = 0.1% latency, 0 FLOPS) |
|
(q_proj): Linear(16.78 M = 0.18% Params, 68.72 GMACs = 0.28% MACs, 349.76 us = 0.14% latency, 392.95 TFLOPS, in_features=4096, out_features=4096, bias=False) |
|
(k_proj): Linear(4.19 M = 0.05% Params, 17.18 GMACs = 0.07% MACs, 208.14 us = 0.08% latency, 165.08 TFLOPS, in_features=4096, out_features=1024, bias=False) |
|
(v_proj): Linear(4.19 M = 0.05% Params, 17.18 GMACs = 0.07% MACs, 195.5 us = 0.08% latency, 175.75 TFLOPS, in_features=4096, out_features=1024, bias=False) |
|
(text_k_proj): Linear(4.19 M = 0.05% Params, 17.18 GMACs = 0.07% MACs, 159.98 us = 0.07% latency, 214.78 TFLOPS, in_features=4096, out_features=1024, bias=False) |
|
(text_v_proj): Linear(4.19 M = 0.05% Params, 17.18 GMACs = 0.07% MACs, 156.16 us = 0.06% latency, 220.02 TFLOPS, in_features=4096, out_features=1024, bias=False) |
|
(o_proj): Linear(16.78 M = 0.18% Params, 68.72 GMACs = 0.28% MACs, 346.42 us = 0.14% latency, 396.74 TFLOPS, in_features=4096, out_features=4096, bias=False) |
|
) |
|
(post_attention_layernorm): GemmaRMSNorm(4.1 K = 0% Params, 0 MACs = 0% MACs, 458.48 us = 0.19% latency, 0 FLOPS) |
|
(mlp): GemmaMLP( |
|
134.38 M = 1.46% Params, 550.43 GMACs = 2.21% MACs, 2.15 ms = 0.87% latency, 512.94 TFLOPS |
|
(gate_proj): Linear(44.79 M = 0.49% Params, 183.48 GMACs = 0.74% MACs, 627.52 us = 0.26% latency, 584.77 TFLOPS, in_features=4096, out_features=10936, bias=False) |
|
(up_proj): Linear(44.79 M = 0.49% Params, 183.48 GMACs = 0.74% MACs, 623.7 us = 0.25% latency, 588.34 TFLOPS, in_features=4096, out_features=10936, bias=False) |
|
(down_proj): Linear(44.79 M = 0.49% Params, 183.48 GMACs = 0.74% MACs, 590.09 us = 0.24% latency, 621.86 TFLOPS, in_features=10936, out_features=4096, bias=False) |
|
(act_fn): PytorchGELUTanh(0 = 0% Params, 0 MACs = 0% MACs, 101.57 us = 0.04% latency, 441.03 GFLOPS) |
|
) |
|
) |
|
(4): DiTLayer( |
|
285.41 M = 3.1% Params, 775.38 GMACs = 3.11% MACs, 7.24 ms = 2.94% latency, 214.17 TFLOPS |
|
(input_layernorm): AdaLayerNormZero( |
|
100.69 M = 1.09% Params, 1.61 GMACs = 0.01% MACs, 959.87 us = 0.39% latency, 3.36 TFLOPS |
|
(silu): SiLU(0 = 0% Params, 0 MACs = 0% MACs, 41.72 us = 0.02% latency, 1.57 GFLOPS) |
|
(linear): Linear(100.69 M = 1.09% Params, 1.61 GMACs = 0.01% MACs, 243.66 us = 0.1% latency, 13.22 TFLOPS, in_features=4096, out_features=24576, bias=True) |
|
(norm): GemmaRMSNorm(4.1 K = 0% Params, 0 MACs = 0% MACs, 463.72 us = 0.19% latency, 0 FLOPS) |
|
) |
|
(self_attn): DiTSelfAttention( |
|
50.33 M = 0.55% Params, 223.34 GMACs = 0.9% MACs, 3.03 ms = 1.23% latency, 147.57 TFLOPS |
|
(q_norm): GemmaRMSNorm(128 = 0% Params, 0 MACs = 0% MACs, 465.63 us = 0.19% latency, 0 FLOPS) |
|
(k_norm): GemmaRMSNorm(128 = 0% Params, 0 MACs = 0% MACs, 237.7 us = 0.1% latency, 0 FLOPS) |
|
(q_proj): Linear(16.78 M = 0.18% Params, 68.72 GMACs = 0.28% MACs, 349.76 us = 0.14% latency, 392.95 TFLOPS, in_features=4096, out_features=4096, bias=False) |
|
(k_proj): Linear(4.19 M = 0.05% Params, 17.18 GMACs = 0.07% MACs, 198.6 us = 0.08% latency, 173.01 TFLOPS, in_features=4096, out_features=1024, bias=False) |
|
(v_proj): Linear(4.19 M = 0.05% Params, 17.18 GMACs = 0.07% MACs, 196.22 us = 0.08% latency, 175.11 TFLOPS, in_features=4096, out_features=1024, bias=False) |
|
(text_k_proj): Linear(4.19 M = 0.05% Params, 17.18 GMACs = 0.07% MACs, 168.56 us = 0.07% latency, 203.84 TFLOPS, in_features=4096, out_features=1024, bias=False) |
|
(text_v_proj): Linear(4.19 M = 0.05% Params, 17.18 GMACs = 0.07% MACs, 156.16 us = 0.06% latency, 220.02 TFLOPS, in_features=4096, out_features=1024, bias=False) |
|
(o_proj): Linear(16.78 M = 0.18% Params, 68.72 GMACs = 0.28% MACs, 334.26 us = 0.14% latency, 411.17 TFLOPS, in_features=4096, out_features=4096, bias=False) |
|
) |
|
(post_attention_layernorm): GemmaRMSNorm(4.1 K = 0% Params, 0 MACs = 0% MACs, 459.19 us = 0.19% latency, 0 FLOPS) |
|
(mlp): GemmaMLP( |
|
134.38 M = 1.46% Params, 550.43 GMACs = 2.21% MACs, 2.14 ms = 0.87% latency, 513.91 TFLOPS |
|
(gate_proj): Linear(44.79 M = 0.49% Params, 183.48 GMACs = 0.74% MACs, 625.13 us = 0.25% latency, 587 TFLOPS, in_features=4096, out_features=10936, bias=False) |
|
(up_proj): Linear(44.79 M = 0.49% Params, 183.48 GMACs = 0.74% MACs, 623.94 us = 0.25% latency, 588.12 TFLOPS, in_features=4096, out_features=10936, bias=False) |
|
(down_proj): Linear(44.79 M = 0.49% Params, 183.48 GMACs = 0.74% MACs, 586.03 us = 0.24% latency, 626.16 TFLOPS, in_features=10936, out_features=4096, bias=False) |
|
(act_fn): PytorchGELUTanh(0 = 0% Params, 0 MACs = 0% MACs, 101.33 us = 0.04% latency, 442.07 GFLOPS) |
|
) |
|
) |
|
(5): DiTLayer( |
|
285.41 M = 3.1% Params, 775.38 GMACs = 3.11% MACs, 7.24 ms = 2.94% latency, 214.34 TFLOPS |
|
(input_layernorm): AdaLayerNormZero( |
|
100.69 M = 1.09% Params, 1.61 GMACs = 0.01% MACs, 960.59 us = 0.39% latency, 3.35 TFLOPS |
|
(silu): SiLU(0 = 0% Params, 0 MACs = 0% MACs, 39.1 us = 0.02% latency, 1.68 GFLOPS) |
|
(linear): Linear(100.69 M = 1.09% Params, 1.61 GMACs = 0.01% MACs, 246.05 us = 0.1% latency, 13.09 TFLOPS, in_features=4096, out_features=24576, bias=True) |
|
(norm): GemmaRMSNorm(4.1 K = 0% Params, 0 MACs = 0% MACs, 463.72 us = 0.19% latency, 0 FLOPS) |
|
) |
|
(self_attn): DiTSelfAttention( |
|
50.33 M = 0.55% Params, 223.34 GMACs = 0.9% MACs, 3.01 ms = 1.22% latency, 148.22 TFLOPS |
|
(q_norm): GemmaRMSNorm(128 = 0% Params, 0 MACs = 0% MACs, 463.01 us = 0.19% latency, 0 FLOPS) |
|
(k_norm): GemmaRMSNorm(128 = 0% Params, 0 MACs = 0% MACs, 236.75 us = 0.1% latency, 0 FLOPS) |
|
(q_proj): Linear(16.78 M = 0.18% Params, 68.72 GMACs = 0.28% MACs, 349.52 us = 0.14% latency, 393.22 TFLOPS, in_features=4096, out_features=4096, bias=False) |
|
(k_proj): Linear(4.19 M = 0.05% Params, 17.18 GMACs = 0.07% MACs, 197.65 us = 0.08% latency, 173.84 TFLOPS, in_features=4096, out_features=1024, bias=False) |
|
(v_proj): Linear(4.19 M = 0.05% Params, 17.18 GMACs = 0.07% MACs, 195.5 us = 0.08% latency, 175.75 TFLOPS, in_features=4096, out_features=1024, bias=False) |
|
(text_k_proj): Linear(4.19 M = 0.05% Params, 17.18 GMACs = 0.07% MACs, 158.55 us = 0.06% latency, 216.71 TFLOPS, in_features=4096, out_features=1024, bias=False) |
|
(text_v_proj): Linear(4.19 M = 0.05% Params, 17.18 GMACs = 0.07% MACs, 155.45 us = 0.06% latency, 221.04 TFLOPS, in_features=4096, out_features=1024, bias=False) |
|
(o_proj): Linear(16.78 M = 0.18% Params, 68.72 GMACs = 0.28% MACs, 333.79 us = 0.14% latency, 411.76 TFLOPS, in_features=4096, out_features=4096, bias=False) |
|
) |
|
(post_attention_layernorm): GemmaRMSNorm(4.1 K = 0% Params, 0 MACs = 0% MACs, 458 us = 0.19% latency, 0 FLOPS) |
|
(mlp): GemmaMLP( |
|
134.38 M = 1.46% Params, 550.43 GMACs = 2.21% MACs, 2.15 ms = 0.87% latency, 512.26 TFLOPS |
|
(gate_proj): Linear(44.79 M = 0.49% Params, 183.48 GMACs = 0.74% MACs, 626.8 us = 0.25% latency, 585.43 TFLOPS, in_features=4096, out_features=10936, bias=False) |
|
(up_proj): Linear(44.79 M = 0.49% Params, 183.48 GMACs = 0.74% MACs, 623.7 us = 0.25% latency, 588.34 TFLOPS, in_features=4096, out_features=10936, bias=False) |
|
(down_proj): Linear(44.79 M = 0.49% Params, 183.48 GMACs = 0.74% MACs, 586.99 us = 0.24% latency, 625.14 TFLOPS, in_features=10936, out_features=4096, bias=False) |
|
(act_fn): PytorchGELUTanh(0 = 0% Params, 0 MACs = 0% MACs, 103 us = 0.04% latency, 434.91 GFLOPS) |
|
) |
|
) |
|
(6): DiTLayer( |
|
285.41 M = 3.1% Params, 775.38 GMACs = 3.11% MACs, 7.25 ms = 2.95% latency, 213.89 TFLOPS |
|
(input_layernorm): AdaLayerNormZero( |
|
100.69 M = 1.09% Params, 1.61 GMACs = 0.01% MACs, 994.21 us = 0.4% latency, 3.24 TFLOPS |
|
(silu): SiLU(0 = 0% Params, 0 MACs = 0% MACs, 40.77 us = 0.02% latency, 1.61 GFLOPS) |
|
(linear): Linear(100.69 M = 1.09% Params, 1.61 GMACs = 0.01% MACs, 237.46 us = 0.1% latency, 13.57 TFLOPS, in_features=4096, out_features=24576, bias=True) |
|
(norm): GemmaRMSNorm(4.1 K = 0% Params, 0 MACs = 0% MACs, 463.01 us = 0.19% latency, 0 FLOPS) |
|
) |
|
(self_attn): DiTSelfAttention( |
|
50.33 M = 0.55% Params, 223.34 GMACs = 0.9% MACs, 3.01 ms = 1.22% latency, 148.35 TFLOPS |
|
(q_norm): GemmaRMSNorm(128 = 0% Params, 0 MACs = 0% MACs, 464.2 us = 0.19% latency, 0 FLOPS) |
|
(k_norm): GemmaRMSNorm(128 = 0% Params, 0 MACs = 0% MACs, 235.08 us = 0.1% latency, 0 FLOPS) |
|
(q_proj): Linear(16.78 M = 0.18% Params, 68.72 GMACs = 0.28% MACs, 347.85 us = 0.14% latency, 395.11 TFLOPS, in_features=4096, out_features=4096, bias=False) |
|
(k_proj): Linear(4.19 M = 0.05% Params, 17.18 GMACs = 0.07% MACs, 196.7 us = 0.08% latency, 174.69 TFLOPS, in_features=4096, out_features=1024, bias=False) |
|
(v_proj): Linear(4.19 M = 0.05% Params, 17.18 GMACs = 0.07% MACs, 193.6 us = 0.08% latency, 177.48 TFLOPS, in_features=4096, out_features=1024, bias=False) |
|
(text_k_proj): Linear(4.19 M = 0.05% Params, 17.18 GMACs = 0.07% MACs, 156.88 us = 0.06% latency, 219.02 TFLOPS, in_features=4096, out_features=1024, bias=False) |
|
(text_v_proj): Linear(4.19 M = 0.05% Params, 17.18 GMACs = 0.07% MACs, 154.26 us = 0.06% latency, 222.74 TFLOPS, in_features=4096, out_features=1024, bias=False) |
|
(o_proj): Linear(16.78 M = 0.18% Params, 68.72 GMACs = 0.28% MACs, 340.22 us = 0.14% latency, 403.97 TFLOPS, in_features=4096, out_features=4096, bias=False) |
|
) |
|
(post_attention_layernorm): GemmaRMSNorm(4.1 K = 0% Params, 0 MACs = 0% MACs, 458.72 us = 0.19% latency, 0 FLOPS) |
|
(mlp): GemmaMLP( |
|
134.38 M = 1.46% Params, 550.43 GMACs = 2.21% MACs, 2.14 ms = 0.87% latency, 515.29 TFLOPS |
|
(gate_proj): Linear(44.79 M = 0.49% Params, 183.48 GMACs = 0.74% MACs, 626.33 us = 0.25% latency, 585.88 TFLOPS, in_features=4096, out_features=10936, bias=False) |
|
(up_proj): Linear(44.79 M = 0.49% Params, 183.48 GMACs = 0.74% MACs, 622.27 us = 0.25% latency, 589.7 TFLOPS, in_features=4096, out_features=10936, bias=False) |
|
(down_proj): Linear(44.79 M = 0.49% Params, 183.48 GMACs = 0.74% MACs, 583.17 us = 0.24% latency, 629.23 TFLOPS, in_features=10936, out_features=4096, bias=False) |
|
(act_fn): PytorchGELUTanh(0 = 0% Params, 0 MACs = 0% MACs, 100.37 us = 0.04% latency, 446.27 GFLOPS) |
|
) |
|
) |
|
(7): DiTLayer( |
|
285.41 M = 3.1% Params, 775.38 GMACs = 3.11% MACs, 7.2 ms = 2.93% latency, 215.45 TFLOPS |
|
(input_layernorm): AdaLayerNormZero( |
|
100.69 M = 1.09% Params, 1.61 GMACs = 0.01% MACs, 946.28 us = 0.38% latency, 3.4 TFLOPS |
|
(silu): SiLU(0 = 0% Params, 0 MACs = 0% MACs, 36 us = 0.01% latency, 1.82 GFLOPS) |
|
(linear): Linear(100.69 M = 1.09% Params, 1.61 GMACs = 0.01% MACs, 242.71 us = 0.1% latency, 13.27 TFLOPS, in_features=4096, out_features=24576, bias=True) |
|
(norm): GemmaRMSNorm(4.1 K = 0% Params, 0 MACs = 0% MACs, 461.1 us = 0.19% latency, 0 FLOPS) |
|
) |
|
(self_attn): DiTSelfAttention( |
|
50.33 M = 0.55% Params, 223.34 GMACs = 0.9% MACs, 3.01 ms = 1.22% latency, 148.48 TFLOPS |
|
(q_norm): GemmaRMSNorm(128 = 0% Params, 0 MACs = 0% MACs, 464.68 us = 0.19% latency, 0 FLOPS) |
|
(k_norm): GemmaRMSNorm(128 = 0% Params, 0 MACs = 0% MACs, 235.08 us = 0.1% latency, 0 FLOPS) |
|
(q_proj): Linear(16.78 M = 0.18% Params, 68.72 GMACs = 0.28% MACs, 350 us = 0.14% latency, 392.68 TFLOPS, in_features=4096, out_features=4096, bias=False) |
|
(k_proj): Linear(4.19 M = 0.05% Params, 17.18 GMACs = 0.07% MACs, 198.84 us = 0.08% latency, 172.8 TFLOPS, in_features=4096, out_features=1024, bias=False) |
|
(v_proj): Linear(4.19 M = 0.05% Params, 17.18 GMACs = 0.07% MACs, 194.79 us = 0.08% latency, 176.4 TFLOPS, in_features=4096, out_features=1024, bias=False) |
|
(text_k_proj): Linear(4.19 M = 0.05% Params, 17.18 GMACs = 0.07% MACs, 155.45 us = 0.06% latency, 221.04 TFLOPS, in_features=4096, out_features=1024, bias=False) |
|
(text_v_proj): Linear(4.19 M = 0.05% Params, 17.18 GMACs = 0.07% MACs, 153.54 us = 0.06% latency, 223.78 TFLOPS, in_features=4096, out_features=1024, bias=False) |
|
(o_proj): Linear(16.78 M = 0.18% Params, 68.72 GMACs = 0.28% MACs, 335.45 us = 0.14% latency, 409.71 TFLOPS, in_features=4096, out_features=4096, bias=False) |
|
) |
|
(post_attention_layernorm): GemmaRMSNorm(4.1 K = 0% Params, 0 MACs = 0% MACs, 456.57 us = 0.19% latency, 0 FLOPS) |
|
(mlp): GemmaMLP( |
|
134.38 M = 1.46% Params, 550.43 GMACs = 2.21% MACs, 2.14 ms = 0.87% latency, 515.12 TFLOPS |
|
(gate_proj): Linear(44.79 M = 0.49% Params, 183.48 GMACs = 0.74% MACs, 627.28 us = 0.25% latency, 584.99 TFLOPS, in_features=4096, out_features=10936, bias=False) |
|
(up_proj): Linear(44.79 M = 0.49% Params, 183.48 GMACs = 0.74% MACs, 622.27 us = 0.25% latency, 589.7 TFLOPS, in_features=4096, out_features=10936, bias=False) |
|
(down_proj): Linear(44.79 M = 0.49% Params, 183.48 GMACs = 0.74% MACs, 583.17 us = 0.24% latency, 629.23 TFLOPS, in_features=10936, out_features=4096, bias=False) |
|
(act_fn): PytorchGELUTanh(0 = 0% Params, 0 MACs = 0% MACs, 100.14 us = 0.04% latency, 447.33 GFLOPS) |
|
) |
|
) |
|
(8): DiTLayer( |
|
285.41 M = 3.1% Params, 775.38 GMACs = 3.11% MACs, 7.27 ms = 2.95% latency, 213.4 TFLOPS |
|
(input_layernorm): AdaLayerNormZero( |
|
100.69 M = 1.09% Params, 1.61 GMACs = 0.01% MACs, 960.35 us = 0.39% latency, 3.35 TFLOPS |
|
(silu): SiLU(0 = 0% Params, 0 MACs = 0% MACs, 43.39 us = 0.02% latency, 1.51 GFLOPS) |
|
(linear): Linear(100.69 M = 1.09% Params, 1.61 GMACs = 0.01% MACs, 238.18 us = 0.1% latency, 13.52 TFLOPS, in_features=4096, out_features=24576, bias=True) |
|
(norm): GemmaRMSNorm(4.1 K = 0% Params, 0 MACs = 0% MACs, 463.72 us = 0.19% latency, 0 FLOPS) |
|
) |
|
(self_attn): DiTSelfAttention( |
|
50.33 M = 0.55% Params, 223.34 GMACs = 0.9% MACs, 3.01 ms = 1.23% latency, 148.17 TFLOPS |
|
(q_norm): GemmaRMSNorm(128 = 0% Params, 0 MACs = 0% MACs, 462.53 us = 0.19% latency, 0 FLOPS) |
|
(k_norm): GemmaRMSNorm(128 = 0% Params, 0 MACs = 0% MACs, 236.27 us = 0.1% latency, 0 FLOPS) |
|
(q_proj): Linear(16.78 M = 0.18% Params, 68.72 GMACs = 0.28% MACs, 348.33 us = 0.14% latency, 394.57 TFLOPS, in_features=4096, out_features=4096, bias=False) |
|
(k_proj): Linear(4.19 M = 0.05% Params, 17.18 GMACs = 0.07% MACs, 198.36 us = 0.08% latency, 173.22 TFLOPS, in_features=4096, out_features=1024, bias=False) |
|
(v_proj): Linear(4.19 M = 0.05% Params, 17.18 GMACs = 0.07% MACs, 194.55 us = 0.08% latency, 176.61 TFLOPS, in_features=4096, out_features=1024, bias=False) |
|
(text_k_proj): Linear(4.19 M = 0.05% Params, 17.18 GMACs = 0.07% MACs, 156.4 us = 0.06% latency, 219.69 TFLOPS, in_features=4096, out_features=1024, bias=False) |
|
(text_v_proj): Linear(4.19 M = 0.05% Params, 17.18 GMACs = 0.07% MACs, 154.97 us = 0.06% latency, 221.72 TFLOPS, in_features=4096, out_features=1024, bias=False) |
|
(o_proj): Linear(16.78 M = 0.18% Params, 68.72 GMACs = 0.28% MACs, 341.89 us = 0.14% latency, 401.99 TFLOPS, in_features=4096, out_features=4096, bias=False) |
|
) |
|
(post_attention_layernorm): GemmaRMSNorm(4.1 K = 0% Params, 0 MACs = 0% MACs, 458 us = 0.19% latency, 0 FLOPS) |
|
(mlp): GemmaMLP( |
|
134.38 M = 1.46% Params, 550.43 GMACs = 2.21% MACs, 2.18 ms = 0.89% latency, 504.81 TFLOPS |
|
(gate_proj): Linear(44.79 M = 0.49% Params, 183.48 GMACs = 0.74% MACs, 627.52 us = 0.26% latency, 584.77 TFLOPS, in_features=4096, out_features=10936, bias=False) |
|
(up_proj): Linear(44.79 M = 0.49% Params, 183.48 GMACs = 0.74% MACs, 624.18 us = 0.25% latency, 587.89 TFLOPS, in_features=4096, out_features=10936, bias=False) |
|
(down_proj): Linear(44.79 M = 0.49% Params, 183.48 GMACs = 0.74% MACs, 587.7 us = 0.24% latency, 624.38 TFLOPS, in_features=10936, out_features=4096, bias=False) |
|
(act_fn): PytorchGELUTanh(0 = 0% Params, 0 MACs = 0% MACs, 103.24 us = 0.04% latency, 433.9 GFLOPS) |
|
) |
|
) |
|
(9): DiTLayer( |
|
285.41 M = 3.1% Params, 775.38 GMACs = 3.11% MACs, 7.27 ms = 2.96% latency, 213.18 TFLOPS |
|
(input_layernorm): AdaLayerNormZero( |
|
100.69 M = 1.09% Params, 1.61 GMACs = 0.01% MACs, 957.73 us = 0.39% latency, 3.36 TFLOPS |
|
(silu): SiLU(0 = 0% Params, 0 MACs = 0% MACs, 38.86 us = 0.02% latency, 1.69 GFLOPS) |
|
(linear): Linear(100.69 M = 1.09% Params, 1.61 GMACs = 0.01% MACs, 240.33 us = 0.1% latency, 13.4 TFLOPS, in_features=4096, out_features=24576, bias=True) |
|
(norm): GemmaRMSNorm(4.1 K = 0% Params, 0 MACs = 0% MACs, 463.72 us = 0.19% latency, 0 FLOPS) |
|
) |
|
(self_attn): DiTSelfAttention( |
|
50.33 M = 0.55% Params, 223.34 GMACs = 0.9% MACs, 3.03 ms = 1.23% latency, 147.24 TFLOPS |
|
(q_norm): GemmaRMSNorm(128 = 0% Params, 0 MACs = 0% MACs, 464.92 us = 0.19% latency, 0 FLOPS) |
|
(k_norm): GemmaRMSNorm(128 = 0% Params, 0 MACs = 0% MACs, 236.03 us = 0.1% latency, 0 FLOPS) |
|
(q_proj): Linear(16.78 M = 0.18% Params, 68.72 GMACs = 0.28% MACs, 350 us = 0.14% latency, 392.68 TFLOPS, in_features=4096, out_features=4096, bias=False) |
|
(k_proj): Linear(4.19 M = 0.05% Params, 17.18 GMACs = 0.07% MACs, 199.08 us = 0.08% latency, 172.59 TFLOPS, in_features=4096, out_features=1024, bias=False) |
|
(v_proj): Linear(4.19 M = 0.05% Params, 17.18 GMACs = 0.07% MACs, 194.79 us = 0.08% latency, 176.4 TFLOPS, in_features=4096, out_features=1024, bias=False) |
|
(text_k_proj): Linear(4.19 M = 0.05% Params, 17.18 GMACs = 0.07% MACs, 157.59 us = 0.06% latency, 218.03 TFLOPS, in_features=4096, out_features=1024, bias=False) |
|
(text_v_proj): Linear(4.19 M = 0.05% Params, 17.18 GMACs = 0.07% MACs, 153.78 us = 0.06% latency, 223.43 TFLOPS, in_features=4096, out_features=1024, bias=False) |
|
(o_proj): Linear(16.78 M = 0.18% Params, 68.72 GMACs = 0.28% MACs, 347.85 us = 0.14% latency, 395.11 TFLOPS, in_features=4096, out_features=4096, bias=False) |
|
) |
|
(post_attention_layernorm): GemmaRMSNorm(4.1 K = 0% Params, 0 MACs = 0% MACs, 459.19 us = 0.19% latency, 0 FLOPS) |
|
(mlp): GemmaMLP( |
|
134.38 M = 1.46% Params, 550.43 GMACs = 2.21% MACs, 2.16 ms = 0.88% latency, 510.73 TFLOPS |
|
(gate_proj): Linear(44.79 M = 0.49% Params, 183.48 GMACs = 0.74% MACs, 635.15 us = 0.26% latency, 577.74 TFLOPS, in_features=4096, out_features=10936, bias=False) |
|
(up_proj): Linear(44.79 M = 0.49% Params, 183.48 GMACs = 0.74% MACs, 623.46 us = 0.25% latency, 588.57 TFLOPS, in_features=4096, out_features=10936, bias=False) |
|
(down_proj): Linear(44.79 M = 0.49% Params, 183.48 GMACs = 0.74% MACs, 587.46 us = 0.24% latency, 624.64 TFLOPS, in_features=10936, out_features=4096, bias=False) |
|
(act_fn): PytorchGELUTanh(0 = 0% Params, 0 MACs = 0% MACs, 102.76 us = 0.04% latency, 435.91 GFLOPS) |
|
) |
|
) |
|
(10): DiTLayer( |
|
285.41 M = 3.1% Params, 775.38 GMACs = 3.11% MACs, 7.27 ms = 2.96% latency, 213.26 TFLOPS |
|
(input_layernorm): AdaLayerNormZero( |
|
100.69 M = 1.09% Params, 1.61 GMACs = 0.01% MACs, 949.38 us = 0.39% latency, 3.39 TFLOPS |
|
(silu): SiLU(0 = 0% Params, 0 MACs = 0% MACs, 36.72 us = 0.01% latency, 1.78 GFLOPS) |
|
(linear): Linear(100.69 M = 1.09% Params, 1.61 GMACs = 0.01% MACs, 241.28 us = 0.1% latency, 13.35 TFLOPS, in_features=4096, out_features=24576, bias=True) |
|
(norm): GemmaRMSNorm(4.1 K = 0% Params, 0 MACs = 0% MACs, 461.58 us = 0.19% latency, 0 FLOPS) |
|
) |
|
(self_attn): DiTSelfAttention( |
|
50.33 M = 0.55% Params, 223.34 GMACs = 0.9% MACs, 3.03 ms = 1.23% latency, 147.18 TFLOPS |
|
(q_norm): GemmaRMSNorm(128 = 0% Params, 0 MACs = 0% MACs, 466.11 us = 0.19% latency, 0 FLOPS) |
|
(k_norm): GemmaRMSNorm(128 = 0% Params, 0 MACs = 0% MACs, 237.46 us = 0.1% latency, 0 FLOPS) |
|
(q_proj): Linear(16.78 M = 0.18% Params, 68.72 GMACs = 0.28% MACs, 349.28 us = 0.14% latency, 393.49 TFLOPS, in_features=4096, out_features=4096, bias=False) |
|
(k_proj): Linear(4.19 M = 0.05% Params, 17.18 GMACs = 0.07% MACs, 198.13 us = 0.08% latency, 173.42 TFLOPS, in_features=4096, out_features=1024, bias=False) |
|
(v_proj): Linear(4.19 M = 0.05% Params, 17.18 GMACs = 0.07% MACs, 194.79 us = 0.08% latency, 176.4 TFLOPS, in_features=4096, out_features=1024, bias=False) |
|
(text_k_proj): Linear(4.19 M = 0.05% Params, 17.18 GMACs = 0.07% MACs, 158.31 us = 0.06% latency, 217.04 TFLOPS, in_features=4096, out_features=1024, bias=False) |
|
(text_v_proj): Linear(4.19 M = 0.05% Params, 17.18 GMACs = 0.07% MACs, 154.97 us = 0.06% latency, 221.72 TFLOPS, in_features=4096, out_features=1024, bias=False) |
|
(o_proj): Linear(16.78 M = 0.18% Params, 68.72 GMACs = 0.28% MACs, 347.14 us = 0.14% latency, 395.92 TFLOPS, in_features=4096, out_features=4096, bias=False) |
|
) |
|
(post_attention_layernorm): GemmaRMSNorm(4.1 K = 0% Params, 0 MACs = 0% MACs, 462.06 us = 0.19% latency, 0 FLOPS) |
|
(mlp): GemmaMLP( |
|
134.38 M = 1.46% Params, 550.43 GMACs = 2.21% MACs, 2.17 ms = 0.88% latency, 508.48 TFLOPS |
|
(gate_proj): Linear(44.79 M = 0.49% Params, 183.48 GMACs = 0.74% MACs, 629.19 us = 0.26% latency, 583.22 TFLOPS, in_features=4096, out_features=10936, bias=False) |
|
(up_proj): Linear(44.79 M = 0.49% Params, 183.48 GMACs = 0.74% MACs, 623.94 us = 0.25% latency, 588.12 TFLOPS, in_features=4096, out_features=10936, bias=False) |
|
(down_proj): Linear(44.79 M = 0.49% Params, 183.48 GMACs = 0.74% MACs, 594.38 us = 0.24% latency, 617.37 TFLOPS, in_features=10936, out_features=4096, bias=False) |
|
(act_fn): PytorchGELUTanh(0 = 0% Params, 0 MACs = 0% MACs, 102.52 us = 0.04% latency, 436.93 GFLOPS) |
|
) |
|
) |
|
(11): DiTLayer( |
|
285.41 M = 3.1% Params, 775.38 GMACs = 3.11% MACs, 7.31 ms = 2.97% latency, 212.09 TFLOPS |
|
(input_layernorm): AdaLayerNormZero( |
|
100.69 M = 1.09% Params, 1.61 GMACs = 0.01% MACs, 954.63 us = 0.39% latency, 3.37 TFLOPS |
|
(silu): SiLU(0 = 0% Params, 0 MACs = 0% MACs, 38.15 us = 0.02% latency, 1.72 GFLOPS) |
|
(linear): Linear(100.69 M = 1.09% Params, 1.61 GMACs = 0.01% MACs, 239.85 us = 0.1% latency, 13.43 TFLOPS, in_features=4096, out_features=24576, bias=True) |
|
(norm): GemmaRMSNorm(4.1 K = 0% Params, 0 MACs = 0% MACs, 462.06 us = 0.19% latency, 0 FLOPS) |
|
) |
|
(self_attn): DiTSelfAttention( |
|
50.33 M = 0.55% Params, 223.34 GMACs = 0.9% MACs, 3.09 ms = 1.26% latency, 144.35 TFLOPS |
|
(q_norm): GemmaRMSNorm(128 = 0% Params, 0 MACs = 0% MACs, 467.54 us = 0.19% latency, 0 FLOPS) |
|
(k_norm): GemmaRMSNorm(128 = 0% Params, 0 MACs = 0% MACs, 236.75 us = 0.1% latency, 0 FLOPS) |
|
(q_proj): Linear(16.78 M = 0.18% Params, 68.72 GMACs = 0.28% MACs, 354.05 us = 0.14% latency, 388.19 TFLOPS, in_features=4096, out_features=4096, bias=False) |
|
(k_proj): Linear(4.19 M = 0.05% Params, 17.18 GMACs = 0.07% MACs, 202.89 us = 0.08% latency, 169.35 TFLOPS, in_features=4096, out_features=1024, bias=False) |
|
(v_proj): Linear(4.19 M = 0.05% Params, 17.18 GMACs = 0.07% MACs, 197.89 us = 0.08% latency, 173.63 TFLOPS, in_features=4096, out_features=1024, bias=False) |
|
(text_k_proj): Linear(4.19 M = 0.05% Params, 17.18 GMACs = 0.07% MACs, 160.46 us = 0.07% latency, 214.14 TFLOPS, in_features=4096, out_features=1024, bias=False) |
|
(text_v_proj): Linear(4.19 M = 0.05% Params, 17.18 GMACs = 0.07% MACs, 160.46 us = 0.07% latency, 214.14 TFLOPS, in_features=4096, out_features=1024, bias=False) |
|
(o_proj): Linear(16.78 M = 0.18% Params, 68.72 GMACs = 0.28% MACs, 340.7 us = 0.14% latency, 403.4 TFLOPS, in_features=4096, out_features=4096, bias=False) |
|
) |
|
(post_attention_layernorm): GemmaRMSNorm(4.1 K = 0% Params, 0 MACs = 0% MACs, 458.24 us = 0.19% latency, 0 FLOPS) |
|
(mlp): GemmaMLP( |
|
134.38 M = 1.46% Params, 550.43 GMACs = 2.21% MACs, 2.15 ms = 0.87% latency, 512.88 TFLOPS |
|
(gate_proj): Linear(44.79 M = 0.49% Params, 183.48 GMACs = 0.74% MACs, 626.09 us = 0.25% latency, 586.1 TFLOPS, in_features=4096, out_features=10936, bias=False) |
|
(up_proj): Linear(44.79 M = 0.49% Params, 183.48 GMACs = 0.74% MACs, 623.23 us = 0.25% latency, 588.79 TFLOPS, in_features=4096, out_features=10936, bias=False) |
|
(down_proj): Linear(44.79 M = 0.49% Params, 183.48 GMACs = 0.74% MACs, 588.66 us = 0.24% latency, 623.37 TFLOPS, in_features=10936, out_features=4096, bias=False) |
|
(act_fn): PytorchGELUTanh(0 = 0% Params, 0 MACs = 0% MACs, 101.33 us = 0.04% latency, 442.07 GFLOPS) |
|
) |
|
) |
|
(12): DiTLayer( |
|
285.41 M = 3.1% Params, 775.38 GMACs = 3.11% MACs, 7.21 ms = 2.93% latency, 215.14 TFLOPS |
|
(input_layernorm): AdaLayerNormZero( |
|
100.69 M = 1.09% Params, 1.61 GMACs = 0.01% MACs, 944.38 us = 0.38% latency, 3.41 TFLOPS |
|
(silu): SiLU(0 = 0% Params, 0 MACs = 0% MACs, 36.95 us = 0.02% latency, 1.77 GFLOPS) |
|
(linear): Linear(100.69 M = 1.09% Params, 1.61 GMACs = 0.01% MACs, 235.8 us = 0.1% latency, 13.66 TFLOPS, in_features=4096, out_features=24576, bias=True) |
|
(norm): GemmaRMSNorm(4.1 K = 0% Params, 0 MACs = 0% MACs, 460.86 us = 0.19% latency, 0 FLOPS) |
|
) |
|
(self_attn): DiTSelfAttention( |
|
50.33 M = 0.55% Params, 223.34 GMACs = 0.9% MACs, 3.01 ms = 1.22% latency, 148.27 TFLOPS |
|
(q_norm): GemmaRMSNorm(128 = 0% Params, 0 MACs = 0% MACs, 464.44 us = 0.19% latency, 0 FLOPS) |
|
(k_norm): GemmaRMSNorm(128 = 0% Params, 0 MACs = 0% MACs, 238.42 us = 0.1% latency, 0 FLOPS) |
|
(q_proj): Linear(16.78 M = 0.18% Params, 68.72 GMACs = 0.28% MACs, 347.38 us = 0.14% latency, 395.65 TFLOPS, in_features=4096, out_features=4096, bias=False) |
|
(k_proj): Linear(4.19 M = 0.05% Params, 17.18 GMACs = 0.07% MACs, 198.13 us = 0.08% latency, 173.42 TFLOPS, in_features=4096, out_features=1024, bias=False) |
|
(v_proj): Linear(4.19 M = 0.05% Params, 17.18 GMACs = 0.07% MACs, 195.98 us = 0.08% latency, 175.32 TFLOPS, in_features=4096, out_features=1024, bias=False) |
|
(text_k_proj): Linear(4.19 M = 0.05% Params, 17.18 GMACs = 0.07% MACs, 158.07 us = 0.06% latency, 217.37 TFLOPS, in_features=4096, out_features=1024, bias=False) |
|
(text_v_proj): Linear(4.19 M = 0.05% Params, 17.18 GMACs = 0.07% MACs, 155.69 us = 0.06% latency, 220.7 TFLOPS, in_features=4096, out_features=1024, bias=False) |
|
(o_proj): Linear(16.78 M = 0.18% Params, 68.72 GMACs = 0.28% MACs, 337.84 us = 0.14% latency, 406.82 TFLOPS, in_features=4096, out_features=4096, bias=False) |
|
) |
|
(post_attention_layernorm): GemmaRMSNorm(4.1 K = 0% Params, 0 MACs = 0% MACs, 458.48 us = 0.19% latency, 0 FLOPS) |
|
(mlp): GemmaMLP( |
|
134.38 M = 1.46% Params, 550.43 GMACs = 2.21% MACs, 2.14 ms = 0.87% latency, 514.26 TFLOPS |
|
(gate_proj): Linear(44.79 M = 0.49% Params, 183.48 GMACs = 0.74% MACs, 624.9 us = 0.25% latency, 587.22 TFLOPS, in_features=4096, out_features=10936, bias=False) |
|
(up_proj): Linear(44.79 M = 0.49% Params, 183.48 GMACs = 0.74% MACs, 622.75 us = 0.25% latency, 589.24 TFLOPS, in_features=4096, out_features=10936, bias=False) |
|
(down_proj): Linear(44.79 M = 0.49% Params, 183.48 GMACs = 0.74% MACs, 587.22 us = 0.24% latency, 624.89 TFLOPS, in_features=10936, out_features=4096, bias=False) |
|
(act_fn): PytorchGELUTanh(0 = 0% Params, 0 MACs = 0% MACs, 101.8 us = 0.04% latency, 440 GFLOPS) |
|
) |
|
) |
|
(13): DiTLayer( |
|
285.41 M = 3.1% Params, 775.38 GMACs = 3.11% MACs, 7.24 ms = 2.94% latency, 214.3 TFLOPS |
|
(input_layernorm): AdaLayerNormZero( |
|
100.69 M = 1.09% Params, 1.61 GMACs = 0.01% MACs, 960.11 us = 0.39% latency, 3.36 TFLOPS |
|
(silu): SiLU(0 = 0% Params, 0 MACs = 0% MACs, 39.34 us = 0.02% latency, 1.67 GFLOPS) |
|
(linear): Linear(100.69 M = 1.09% Params, 1.61 GMACs = 0.01% MACs, 246.05 us = 0.1% latency, 13.09 TFLOPS, in_features=4096, out_features=24576, bias=True) |
|
(norm): GemmaRMSNorm(4.1 K = 0% Params, 0 MACs = 0% MACs, 464.2 us = 0.19% latency, 0 FLOPS) |
|
) |
|
(self_attn): DiTSelfAttention( |
|
50.33 M = 0.55% Params, 223.34 GMACs = 0.9% MACs, 3.03 ms = 1.23% latency, 147.59 TFLOPS |
|
(q_norm): GemmaRMSNorm(128 = 0% Params, 0 MACs = 0% MACs, 464.44 us = 0.19% latency, 0 FLOPS) |
|
(k_norm): GemmaRMSNorm(128 = 0% Params, 0 MACs = 0% MACs, 236.27 us = 0.1% latency, 0 FLOPS) |
|
(q_proj): Linear(16.78 M = 0.18% Params, 68.72 GMACs = 0.28% MACs, 348.09 us = 0.14% latency, 394.84 TFLOPS, in_features=4096, out_features=4096, bias=False) |
|
(k_proj): Linear(4.19 M = 0.05% Params, 17.18 GMACs = 0.07% MACs, 199.32 us = 0.08% latency, 172.39 TFLOPS, in_features=4096, out_features=1024, bias=False) |
|
(v_proj): Linear(4.19 M = 0.05% Params, 17.18 GMACs = 0.07% MACs, 195.26 us = 0.08% latency, 175.96 TFLOPS, in_features=4096, out_features=1024, bias=False) |
|
(text_k_proj): Linear(4.19 M = 0.05% Params, 17.18 GMACs = 0.07% MACs, 168.56 us = 0.07% latency, 203.84 TFLOPS, in_features=4096, out_features=1024, bias=False) |
|
(text_v_proj): Linear(4.19 M = 0.05% Params, 17.18 GMACs = 0.07% MACs, 155.69 us = 0.06% latency, 220.7 TFLOPS, in_features=4096, out_features=1024, bias=False) |
|
(o_proj): Linear(16.78 M = 0.18% Params, 68.72 GMACs = 0.28% MACs, 338.08 us = 0.14% latency, 406.53 TFLOPS, in_features=4096, out_features=4096, bias=False) |
|
) |
|
(post_attention_layernorm): GemmaRMSNorm(4.1 K = 0% Params, 0 MACs = 0% MACs, 459.19 us = 0.19% latency, 0 FLOPS) |
|
(mlp): GemmaMLP( |
|
134.38 M = 1.46% Params, 550.43 GMACs = 2.21% MACs, 2.14 ms = 0.87% latency, 514.6 TFLOPS |
|
(gate_proj): Linear(44.79 M = 0.49% Params, 183.48 GMACs = 0.74% MACs, 626.33 us = 0.25% latency, 585.88 TFLOPS, in_features=4096, out_features=10936, bias=False) |
|
(up_proj): Linear(44.79 M = 0.49% Params, 183.48 GMACs = 0.74% MACs, 621.8 us = 0.25% latency, 590.15 TFLOPS, in_features=4096, out_features=10936, bias=False) |
|
(down_proj): Linear(44.79 M = 0.49% Params, 183.48 GMACs = 0.74% MACs, 587.94 us = 0.24% latency, 624.13 TFLOPS, in_features=10936, out_features=4096, bias=False) |
|
(act_fn): PytorchGELUTanh(0 = 0% Params, 0 MACs = 0% MACs, 101.57 us = 0.04% latency, 441.03 GFLOPS) |
|
) |
|
) |
|
(14): DiTLayer( |
|
285.41 M = 3.1% Params, 775.38 GMACs = 3.11% MACs, 7.21 ms = 2.93% latency, 214.95 TFLOPS |
|
(input_layernorm): AdaLayerNormZero( |
|
100.69 M = 1.09% Params, 1.61 GMACs = 0.01% MACs, 953.2 us = 0.39% latency, 3.38 TFLOPS |
|
(silu): SiLU(0 = 0% Params, 0 MACs = 0% MACs, 39.58 us = 0.02% latency, 1.66 GFLOPS) |
|
(linear): Linear(100.69 M = 1.09% Params, 1.61 GMACs = 0.01% MACs, 242.23 us = 0.1% latency, 13.3 TFLOPS, in_features=4096, out_features=24576, bias=True) |
|
(norm): GemmaRMSNorm(4.1 K = 0% Params, 0 MACs = 0% MACs, 461.34 us = 0.19% latency, 0 FLOPS) |
|
) |
|
(self_attn): DiTSelfAttention( |
|
50.33 M = 0.55% Params, 223.34 GMACs = 0.9% MACs, 3.01 ms = 1.22% latency, 148.21 TFLOPS |
|
(q_norm): GemmaRMSNorm(128 = 0% Params, 0 MACs = 0% MACs, 463.96 us = 0.19% latency, 0 FLOPS) |
|
(k_norm): GemmaRMSNorm(128 = 0% Params, 0 MACs = 0% MACs, 236.03 us = 0.1% latency, 0 FLOPS) |
|
(q_proj): Linear(16.78 M = 0.18% Params, 68.72 GMACs = 0.28% MACs, 348.81 us = 0.14% latency, 394.03 TFLOPS, in_features=4096, out_features=4096, bias=False) |
|
(k_proj): Linear(4.19 M = 0.05% Params, 17.18 GMACs = 0.07% MACs, 197.65 us = 0.08% latency, 173.84 TFLOPS, in_features=4096, out_features=1024, bias=False) |
|
(v_proj): Linear(4.19 M = 0.05% Params, 17.18 GMACs = 0.07% MACs, 193.83 us = 0.08% latency, 177.26 TFLOPS, in_features=4096, out_features=1024, bias=False) |
|
(text_k_proj): Linear(4.19 M = 0.05% Params, 17.18 GMACs = 0.07% MACs, 156.4 us = 0.06% latency, 219.69 TFLOPS, in_features=4096, out_features=1024, bias=False) |
|
(text_v_proj): Linear(4.19 M = 0.05% Params, 17.18 GMACs = 0.07% MACs, 154.26 us = 0.06% latency, 222.74 TFLOPS, in_features=4096, out_features=1024, bias=False) |
|
(o_proj): Linear(16.78 M = 0.18% Params, 68.72 GMACs = 0.28% MACs, 339.98 us = 0.14% latency, 404.25 TFLOPS, in_features=4096, out_features=4096, bias=False) |
|
) |
|
(post_attention_layernorm): GemmaRMSNorm(4.1 K = 0% Params, 0 MACs = 0% MACs, 458.48 us = 0.19% latency, 0 FLOPS) |
|
(mlp): GemmaMLP( |
|
134.38 M = 1.46% Params, 550.43 GMACs = 2.21% MACs, 2.14 ms = 0.87% latency, 515.29 TFLOPS |
|
(gate_proj): Linear(44.79 M = 0.49% Params, 183.48 GMACs = 0.74% MACs, 624.66 us = 0.25% latency, 587.44 TFLOPS, in_features=4096, out_features=10936, bias=False) |
|
(up_proj): Linear(44.79 M = 0.49% Params, 183.48 GMACs = 0.74% MACs, 621.56 us = 0.25% latency, 590.37 TFLOPS, in_features=4096, out_features=10936, bias=False) |
|
(down_proj): Linear(44.79 M = 0.49% Params, 183.48 GMACs = 0.74% MACs, 587.7 us = 0.24% latency, 624.38 TFLOPS, in_features=10936, out_features=4096, bias=False) |
|
(act_fn): PytorchGELUTanh(0 = 0% Params, 0 MACs = 0% MACs, 100.61 us = 0.04% latency, 445.21 GFLOPS) |
|
) |
|
) |
|
(15): DiTLayer( |
|
285.41 M = 3.1% Params, 775.38 GMACs = 3.11% MACs, 7.23 ms = 2.94% latency, 214.39 TFLOPS |
|
(input_layernorm): AdaLayerNormZero( |
|
100.69 M = 1.09% Params, 1.61 GMACs = 0.01% MACs, 954.15 us = 0.39% latency, 3.38 TFLOPS |
|
(silu): SiLU(0 = 0% Params, 0 MACs = 0% MACs, 40.05 us = 0.02% latency, 1.64 GFLOPS) |
|
(linear): Linear(100.69 M = 1.09% Params, 1.61 GMACs = 0.01% MACs, 240.33 us = 0.1% latency, 13.4 TFLOPS, in_features=4096, out_features=24576, bias=True) |
|
(norm): GemmaRMSNorm(4.1 K = 0% Params, 0 MACs = 0% MACs, 464.92 us = 0.19% latency, 0 FLOPS) |
|
) |
|
(self_attn): DiTSelfAttention( |
|
50.33 M = 0.55% Params, 223.34 GMACs = 0.9% MACs, 3.02 ms = 1.23% latency, 147.86 TFLOPS |
|
(q_norm): GemmaRMSNorm(128 = 0% Params, 0 MACs = 0% MACs, 463.96 us = 0.19% latency, 0 FLOPS) |
|
(k_norm): GemmaRMSNorm(128 = 0% Params, 0 MACs = 0% MACs, 236.03 us = 0.1% latency, 0 FLOPS) |
|
(q_proj): Linear(16.78 M = 0.18% Params, 68.72 GMACs = 0.28% MACs, 350.48 us = 0.14% latency, 392.15 TFLOPS, in_features=4096, out_features=4096, bias=False) |
|
(k_proj): Linear(4.19 M = 0.05% Params, 17.18 GMACs = 0.07% MACs, 198.84 us = 0.08% latency, 172.8 TFLOPS, in_features=4096, out_features=1024, bias=False) |
|
(v_proj): Linear(4.19 M = 0.05% Params, 17.18 GMACs = 0.07% MACs, 195.5 us = 0.08% latency, 175.75 TFLOPS, in_features=4096, out_features=1024, bias=False) |
|
(text_k_proj): Linear(4.19 M = 0.05% Params, 17.18 GMACs = 0.07% MACs, 158.79 us = 0.06% latency, 216.39 TFLOPS, in_features=4096, out_features=1024, bias=False) |
|
(text_v_proj): Linear(4.19 M = 0.05% Params, 17.18 GMACs = 0.07% MACs, 154.26 us = 0.06% latency, 222.74 TFLOPS, in_features=4096, out_features=1024, bias=False) |
|
(o_proj): Linear(16.78 M = 0.18% Params, 68.72 GMACs = 0.28% MACs, 340.7 us = 0.14% latency, 403.4 TFLOPS, in_features=4096, out_features=4096, bias=False) |
|
) |
|
(post_attention_layernorm): GemmaRMSNorm(4.1 K = 0% Params, 0 MACs = 0% MACs, 458.24 us = 0.19% latency, 0 FLOPS) |
|
(mlp): GemmaMLP( |
|
134.38 M = 1.46% Params, 550.43 GMACs = 2.21% MACs, 2.14 ms = 0.87% latency, 513.74 TFLOPS |
|
(gate_proj): Linear(44.79 M = 0.49% Params, 183.48 GMACs = 0.74% MACs, 628.47 us = 0.26% latency, 583.88 TFLOPS, in_features=4096, out_features=10936, bias=False) |
|
(up_proj): Linear(44.79 M = 0.49% Params, 183.48 GMACs = 0.74% MACs, 624.9 us = 0.25% latency, 587.22 TFLOPS, in_features=4096, out_features=10936, bias=False) |
|
(down_proj): Linear(44.79 M = 0.49% Params, 183.48 GMACs = 0.74% MACs, 586.03 us = 0.24% latency, 626.16 TFLOPS, in_features=10936, out_features=4096, bias=False) |
|
(act_fn): PytorchGELUTanh(0 = 0% Params, 0 MACs = 0% MACs, 101.09 us = 0.04% latency, 443.11 GFLOPS) |
|
) |
|
) |
|
(16): DiTLayer( |
|
285.41 M = 3.1% Params, 775.38 GMACs = 3.11% MACs, 7.28 ms = 2.96% latency, 212.96 TFLOPS |
|
(input_layernorm): AdaLayerNormZero( |
|
100.69 M = 1.09% Params, 1.61 GMACs = 0.01% MACs, 967.03 us = 0.39% latency, 3.33 TFLOPS |
|
(silu): SiLU(0 = 0% Params, 0 MACs = 0% MACs, 48.16 us = 0.02% latency, 1.36 GFLOPS) |
|
(linear): Linear(100.69 M = 1.09% Params, 1.61 GMACs = 0.01% MACs, 244.86 us = 0.1% latency, 13.16 TFLOPS, in_features=4096, out_features=24576, bias=True) |
|
(norm): GemmaRMSNorm(4.1 K = 0% Params, 0 MACs = 0% MACs, 463.96 us = 0.19% latency, 0 FLOPS) |
|
) |
|
(self_attn): DiTSelfAttention( |
|
50.33 M = 0.55% Params, 223.34 GMACs = 0.9% MACs, 3.04 ms = 1.23% latency, 147.06 TFLOPS |
|
(q_norm): GemmaRMSNorm(128 = 0% Params, 0 MACs = 0% MACs, 465.39 us = 0.19% latency, 0 FLOPS) |
|
(k_norm): GemmaRMSNorm(128 = 0% Params, 0 MACs = 0% MACs, 236.51 us = 0.1% latency, 0 FLOPS) |
|
(q_proj): Linear(16.78 M = 0.18% Params, 68.72 GMACs = 0.28% MACs, 351.67 us = 0.14% latency, 390.82 TFLOPS, in_features=4096, out_features=4096, bias=False) |
|
(k_proj): Linear(4.19 M = 0.05% Params, 17.18 GMACs = 0.07% MACs, 197.65 us = 0.08% latency, 173.84 TFLOPS, in_features=4096, out_features=1024, bias=False) |
|
(v_proj): Linear(4.19 M = 0.05% Params, 17.18 GMACs = 0.07% MACs, 195.5 us = 0.08% latency, 175.75 TFLOPS, in_features=4096, out_features=1024, bias=False) |
|
(text_k_proj): Linear(4.19 M = 0.05% Params, 17.18 GMACs = 0.07% MACs, 158.55 us = 0.06% latency, 216.71 TFLOPS, in_features=4096, out_features=1024, bias=False) |
|
(text_v_proj): Linear(4.19 M = 0.05% Params, 17.18 GMACs = 0.07% MACs, 154.26 us = 0.06% latency, 222.74 TFLOPS, in_features=4096, out_features=1024, bias=False) |
|
(o_proj): Linear(16.78 M = 0.18% Params, 68.72 GMACs = 0.28% MACs, 344.28 us = 0.14% latency, 399.21 TFLOPS, in_features=4096, out_features=4096, bias=False) |
|
) |
|
(post_attention_layernorm): GemmaRMSNorm(4.1 K = 0% Params, 0 MACs = 0% MACs, 457.53 us = 0.19% latency, 0 FLOPS) |
|
(mlp): GemmaMLP( |
|
134.38 M = 1.46% Params, 550.43 GMACs = 2.21% MACs, 2.16 ms = 0.88% latency, 509.04 TFLOPS |
|
(gate_proj): Linear(44.79 M = 0.49% Params, 183.48 GMACs = 0.74% MACs, 631.81 us = 0.26% latency, 580.79 TFLOPS, in_features=4096, out_features=10936, bias=False) |
|
(up_proj): Linear(44.79 M = 0.49% Params, 183.48 GMACs = 0.74% MACs, 625.85 us = 0.25% latency, 586.33 TFLOPS, in_features=4096, out_features=10936, bias=False) |
|
(down_proj): Linear(44.79 M = 0.49% Params, 183.48 GMACs = 0.74% MACs, 596.76 us = 0.24% latency, 614.9 TFLOPS, in_features=10936, out_features=4096, bias=False) |
|
(act_fn): PytorchGELUTanh(0 = 0% Params, 0 MACs = 0% MACs, 101.33 us = 0.04% latency, 442.07 GFLOPS) |
|
) |
|
) |
|
(17): DiTLayer( |
|
285.41 M = 3.1% Params, 775.38 GMACs = 3.11% MACs, 7.22 ms = 2.94% latency, 214.68 TFLOPS |
|
(input_layernorm): AdaLayerNormZero( |
|
100.69 M = 1.09% Params, 1.61 GMACs = 0.01% MACs, 951.29 us = 0.39% latency, 3.39 TFLOPS |
|
(silu): SiLU(0 = 0% Params, 0 MACs = 0% MACs, 38.62 us = 0.02% latency, 1.7 GFLOPS) |
|
(linear): Linear(100.69 M = 1.09% Params, 1.61 GMACs = 0.01% MACs, 240.8 us = 0.1% latency, 13.38 TFLOPS, in_features=4096, out_features=24576, bias=True) |
|
(norm): GemmaRMSNorm(4.1 K = 0% Params, 0 MACs = 0% MACs, 461.58 us = 0.19% latency, 0 FLOPS) |
|
) |
|
(self_attn): DiTSelfAttention( |
|
50.33 M = 0.55% Params, 223.34 GMACs = 0.9% MACs, 3.01 ms = 1.22% latency, 148.22 TFLOPS |
|
(q_norm): GemmaRMSNorm(128 = 0% Params, 0 MACs = 0% MACs, 465.39 us = 0.19% latency, 0 FLOPS) |
|
(k_norm): GemmaRMSNorm(128 = 0% Params, 0 MACs = 0% MACs, 235.32 us = 0.1% latency, 0 FLOPS) |
|
(q_proj): Linear(16.78 M = 0.18% Params, 68.72 GMACs = 0.28% MACs, 348.09 us = 0.14% latency, 394.84 TFLOPS, in_features=4096, out_features=4096, bias=False) |
|
(k_proj): Linear(4.19 M = 0.05% Params, 17.18 GMACs = 0.07% MACs, 196.93 us = 0.08% latency, 174.47 TFLOPS, in_features=4096, out_features=1024, bias=False) |
|
(v_proj): Linear(4.19 M = 0.05% Params, 17.18 GMACs = 0.07% MACs, 194.55 us = 0.08% latency, 176.61 TFLOPS, in_features=4096, out_features=1024, bias=False) |
|
(text_k_proj): Linear(4.19 M = 0.05% Params, 17.18 GMACs = 0.07% MACs, 156.4 us = 0.06% latency, 219.69 TFLOPS, in_features=4096, out_features=1024, bias=False) |
|
(text_v_proj): Linear(4.19 M = 0.05% Params, 17.18 GMACs = 0.07% MACs, 155.93 us = 0.06% latency, 220.36 TFLOPS, in_features=4096, out_features=1024, bias=False) |
|
(o_proj): Linear(16.78 M = 0.18% Params, 68.72 GMACs = 0.28% MACs, 342.61 us = 0.14% latency, 401.16 TFLOPS, in_features=4096, out_features=4096, bias=False) |
|
) |
|
(post_attention_layernorm): GemmaRMSNorm(4.1 K = 0% Params, 0 MACs = 0% MACs, 458.24 us = 0.19% latency, 0 FLOPS) |
|
(mlp): GemmaMLP( |
|
134.38 M = 1.46% Params, 550.43 GMACs = 2.21% MACs, 2.15 ms = 0.87% latency, 513 TFLOPS |
|
(gate_proj): Linear(44.79 M = 0.49% Params, 183.48 GMACs = 0.74% MACs, 629.43 us = 0.26% latency, 582.99 TFLOPS, in_features=4096, out_features=10936, bias=False) |
|
(up_proj): Linear(44.79 M = 0.49% Params, 183.48 GMACs = 0.74% MACs, 625.37 us = 0.25% latency, 586.77 TFLOPS, in_features=4096, out_features=10936, bias=False) |
|
(down_proj): Linear(44.79 M = 0.49% Params, 183.48 GMACs = 0.74% MACs, 585.08 us = 0.24% latency, 627.18 TFLOPS, in_features=10936, out_features=4096, bias=False) |
|
(act_fn): PytorchGELUTanh(0 = 0% Params, 0 MACs = 0% MACs, 101.09 us = 0.04% latency, 443.11 GFLOPS) |
|
) |
|
) |
|
(18): DiTLayer( |
|
285.41 M = 3.1% Params, 775.38 GMACs = 3.11% MACs, 7.2 ms = 2.93% latency, 215.25 TFLOPS |
|
(input_layernorm): AdaLayerNormZero( |
|
100.69 M = 1.09% Params, 1.61 GMACs = 0.01% MACs, 938.89 us = 0.38% latency, 3.43 TFLOPS |
|
(silu): SiLU(0 = 0% Params, 0 MACs = 0% MACs, 37.19 us = 0.02% latency, 1.76 GFLOPS) |
|
(linear): Linear(100.69 M = 1.09% Params, 1.61 GMACs = 0.01% MACs, 234.6 us = 0.1% latency, 13.73 TFLOPS, in_features=4096, out_features=24576, bias=True) |
|
(norm): GemmaRMSNorm(4.1 K = 0% Params, 0 MACs = 0% MACs, 461.1 us = 0.19% latency, 0 FLOPS) |
|
) |
|
(self_attn): DiTSelfAttention( |
|
50.33 M = 0.55% Params, 223.34 GMACs = 0.9% MACs, 3.01 ms = 1.23% latency, 148.16 TFLOPS |
|
(q_norm): GemmaRMSNorm(128 = 0% Params, 0 MACs = 0% MACs, 465.15 us = 0.19% latency, 0 FLOPS) |
|
(k_norm): GemmaRMSNorm(128 = 0% Params, 0 MACs = 0% MACs, 236.27 us = 0.1% latency, 0 FLOPS) |
|
(q_proj): Linear(16.78 M = 0.18% Params, 68.72 GMACs = 0.28% MACs, 350.71 us = 0.14% latency, 391.88 TFLOPS, in_features=4096, out_features=4096, bias=False) |
|
(k_proj): Linear(4.19 M = 0.05% Params, 17.18 GMACs = 0.07% MACs, 197.17 us = 0.08% latency, 174.26 TFLOPS, in_features=4096, out_features=1024, bias=False) |
|
(v_proj): Linear(4.19 M = 0.05% Params, 17.18 GMACs = 0.07% MACs, 195.03 us = 0.08% latency, 176.18 TFLOPS, in_features=4096, out_features=1024, bias=False) |
|
(text_k_proj): Linear(4.19 M = 0.05% Params, 17.18 GMACs = 0.07% MACs, 155.93 us = 0.06% latency, 220.36 TFLOPS, in_features=4096, out_features=1024, bias=False) |
|
(text_v_proj): Linear(4.19 M = 0.05% Params, 17.18 GMACs = 0.07% MACs, 154.26 us = 0.06% latency, 222.74 TFLOPS, in_features=4096, out_features=1024, bias=False) |
|
(o_proj): Linear(16.78 M = 0.18% Params, 68.72 GMACs = 0.28% MACs, 334.02 us = 0.14% latency, 411.46 TFLOPS, in_features=4096, out_features=4096, bias=False) |
|
) |
|
(post_attention_layernorm): GemmaRMSNorm(4.1 K = 0% Params, 0 MACs = 0% MACs, 460.15 us = 0.19% latency, 0 FLOPS) |
|
(mlp): GemmaMLP( |
|
134.38 M = 1.46% Params, 550.43 GMACs = 2.21% MACs, 2.14 ms = 0.87% latency, 514.71 TFLOPS |
|
(gate_proj): Linear(44.79 M = 0.49% Params, 183.48 GMACs = 0.74% MACs, 627.28 us = 0.25% latency, 584.99 TFLOPS, in_features=4096, out_features=10936, bias=False) |
|
(up_proj): Linear(44.79 M = 0.49% Params, 183.48 GMACs = 0.74% MACs, 623.94 us = 0.25% latency, 588.12 TFLOPS, in_features=4096, out_features=10936, bias=False) |
|
(down_proj): Linear(44.79 M = 0.49% Params, 183.48 GMACs = 0.74% MACs, 584.6 us = 0.24% latency, 627.69 TFLOPS, in_features=10936, out_features=4096, bias=False) |
|
(act_fn): PytorchGELUTanh(0 = 0% Params, 0 MACs = 0% MACs, 101.09 us = 0.04% latency, 443.11 GFLOPS) |
|
) |
|
) |
|
(19): DiTLayer( |
|
285.41 M = 3.1% Params, 775.38 GMACs = 3.11% MACs, 7.41 ms = 3.01% latency, 209.34 TFLOPS |
|
(input_layernorm): AdaLayerNormZero( |
|
100.69 M = 1.09% Params, 1.61 GMACs = 0.01% MACs, 945.33 us = 0.38% latency, 3.41 TFLOPS |
|
(silu): SiLU(0 = 0% Params, 0 MACs = 0% MACs, 37.67 us = 0.02% latency, 1.74 GFLOPS) |
|
(linear): Linear(100.69 M = 1.09% Params, 1.61 GMACs = 0.01% MACs, 237.7 us = 0.1% latency, 13.55 TFLOPS, in_features=4096, out_features=24576, bias=True) |
|
(norm): GemmaRMSNorm(4.1 K = 0% Params, 0 MACs = 0% MACs, 462.53 us = 0.19% latency, 0 FLOPS) |
|
) |
|
(self_attn): DiTSelfAttention( |
|
50.33 M = 0.55% Params, 223.34 GMACs = 0.9% MACs, 3.01 ms = 1.23% latency, 148.17 TFLOPS |
|
(q_norm): GemmaRMSNorm(128 = 0% Params, 0 MACs = 0% MACs, 464.44 us = 0.19% latency, 0 FLOPS) |
|
(k_norm): GemmaRMSNorm(128 = 0% Params, 0 MACs = 0% MACs, 233.89 us = 0.1% latency, 0 FLOPS) |
|
(q_proj): Linear(16.78 M = 0.18% Params, 68.72 GMACs = 0.28% MACs, 350 us = 0.14% latency, 392.68 TFLOPS, in_features=4096, out_features=4096, bias=False) |
|
(k_proj): Linear(4.19 M = 0.05% Params, 17.18 GMACs = 0.07% MACs, 198.6 us = 0.08% latency, 173.01 TFLOPS, in_features=4096, out_features=1024, bias=False) |
|
(v_proj): Linear(4.19 M = 0.05% Params, 17.18 GMACs = 0.07% MACs, 196.22 us = 0.08% latency, 175.11 TFLOPS, in_features=4096, out_features=1024, bias=False) |
|
(text_k_proj): Linear(4.19 M = 0.05% Params, 17.18 GMACs = 0.07% MACs, 158.07 us = 0.06% latency, 217.37 TFLOPS, in_features=4096, out_features=1024, bias=False) |
|
(text_v_proj): Linear(4.19 M = 0.05% Params, 17.18 GMACs = 0.07% MACs, 154.97 us = 0.06% latency, 221.72 TFLOPS, in_features=4096, out_features=1024, bias=False) |
|
(o_proj): Linear(16.78 M = 0.18% Params, 68.72 GMACs = 0.28% MACs, 333.07 us = 0.14% latency, 412.64 TFLOPS, in_features=4096, out_features=4096, bias=False) |
|
) |
|
(post_attention_layernorm): GemmaRMSNorm(4.1 K = 0% Params, 0 MACs = 0% MACs, 458.24 us = 0.19% latency, 0 FLOPS) |
|
(mlp): GemmaMLP( |
|
134.38 M = 1.46% Params, 550.43 GMACs = 2.21% MACs, 2.33 ms = 0.95% latency, 471.85 TFLOPS |
|
(gate_proj): Linear(44.79 M = 0.49% Params, 183.48 GMACs = 0.74% MACs, 627.76 us = 0.26% latency, 584.54 TFLOPS, in_features=4096, out_features=10936, bias=False) |
|
(up_proj): Linear(44.79 M = 0.49% Params, 183.48 GMACs = 0.74% MACs, 632.76 us = 0.26% latency, 579.92 TFLOPS, in_features=4096, out_features=10936, bias=False) |
|
(down_proj): Linear(44.79 M = 0.49% Params, 183.48 GMACs = 0.74% MACs, 596.28 us = 0.24% latency, 615.4 TFLOPS, in_features=10936, out_features=4096, bias=False) |
|
(act_fn): PytorchGELUTanh(0 = 0% Params, 0 MACs = 0% MACs, 235.08 us = 0.1% latency, 190.55 GFLOPS) |
|
) |
|
) |
|
(20): DiTLayer( |
|
285.41 M = 3.1% Params, 775.38 GMACs = 3.11% MACs, 7.26 ms = 2.95% latency, 213.63 TFLOPS |
|
(input_layernorm): AdaLayerNormZero( |
|
100.69 M = 1.09% Params, 1.61 GMACs = 0.01% MACs, 948.19 us = 0.39% latency, 3.4 TFLOPS |
|
(silu): SiLU(0 = 0% Params, 0 MACs = 0% MACs, 37.19 us = 0.02% latency, 1.76 GFLOPS) |
|
(linear): Linear(100.69 M = 1.09% Params, 1.61 GMACs = 0.01% MACs, 234.84 us = 0.1% latency, 13.72 TFLOPS, in_features=4096, out_features=24576, bias=True) |
|
(norm): GemmaRMSNorm(4.1 K = 0% Params, 0 MACs = 0% MACs, 463.96 us = 0.19% latency, 0 FLOPS) |
|
) |
|
(self_attn): DiTSelfAttention( |
|
50.33 M = 0.55% Params, 223.34 GMACs = 0.9% MACs, 3.05 ms = 1.24% latency, 146.58 TFLOPS |
|
(q_norm): GemmaRMSNorm(128 = 0% Params, 0 MACs = 0% MACs, 468.02 us = 0.19% latency, 0 FLOPS) |
|
(k_norm): GemmaRMSNorm(128 = 0% Params, 0 MACs = 0% MACs, 240.33 us = 0.1% latency, 0 FLOPS) |
|
(q_proj): Linear(16.78 M = 0.18% Params, 68.72 GMACs = 0.28% MACs, 349.76 us = 0.14% latency, 392.95 TFLOPS, in_features=4096, out_features=4096, bias=False) |
|
(k_proj): Linear(4.19 M = 0.05% Params, 17.18 GMACs = 0.07% MACs, 198.36 us = 0.08% latency, 173.22 TFLOPS, in_features=4096, out_features=1024, bias=False) |
|
(v_proj): Linear(4.19 M = 0.05% Params, 17.18 GMACs = 0.07% MACs, 196.93 us = 0.08% latency, 174.47 TFLOPS, in_features=4096, out_features=1024, bias=False) |
|
(text_k_proj): Linear(4.19 M = 0.05% Params, 17.18 GMACs = 0.07% MACs, 158.31 us = 0.06% latency, 217.04 TFLOPS, in_features=4096, out_features=1024, bias=False) |
|
(text_v_proj): Linear(4.19 M = 0.05% Params, 17.18 GMACs = 0.07% MACs, 154.97 us = 0.06% latency, 221.72 TFLOPS, in_features=4096, out_features=1024, bias=False) |
|
(o_proj): Linear(16.78 M = 0.18% Params, 68.72 GMACs = 0.28% MACs, 344.04 us = 0.14% latency, 399.49 TFLOPS, in_features=4096, out_features=4096, bias=False) |
|
) |
|
(post_attention_layernorm): GemmaRMSNorm(4.1 K = 0% Params, 0 MACs = 0% MACs, 458.48 us = 0.19% latency, 0 FLOPS) |
|
(mlp): GemmaMLP( |
|
134.38 M = 1.46% Params, 550.43 GMACs = 2.21% MACs, 2.15 ms = 0.87% latency, 512.71 TFLOPS |
|
(gate_proj): Linear(44.79 M = 0.49% Params, 183.48 GMACs = 0.74% MACs, 628.47 us = 0.26% latency, 583.88 TFLOPS, in_features=4096, out_features=10936, bias=False) |
|
(up_proj): Linear(44.79 M = 0.49% Params, 183.48 GMACs = 0.74% MACs, 624.42 us = 0.25% latency, 587.67 TFLOPS, in_features=4096, out_features=10936, bias=False) |
|
(down_proj): Linear(44.79 M = 0.49% Params, 183.48 GMACs = 0.74% MACs, 589.61 us = 0.24% latency, 622.36 TFLOPS, in_features=10936, out_features=4096, bias=False) |
|
(act_fn): PytorchGELUTanh(0 = 0% Params, 0 MACs = 0% MACs, 101.57 us = 0.04% latency, 441.03 GFLOPS) |
|
) |
|
) |
|
(21): DiTLayer( |
|
285.41 M = 3.1% Params, 775.38 GMACs = 3.11% MACs, 7.23 ms = 2.94% latency, 214.63 TFLOPS |
|
(input_layernorm): AdaLayerNormZero( |
|
100.69 M = 1.09% Params, 1.61 GMACs = 0.01% MACs, 953.91 us = 0.39% latency, 3.38 TFLOPS |
|
(silu): SiLU(0 = 0% Params, 0 MACs = 0% MACs, 41.48 us = 0.02% latency, 1.58 GFLOPS) |
|
(linear): Linear(100.69 M = 1.09% Params, 1.61 GMACs = 0.01% MACs, 238.42 us = 0.1% latency, 13.51 TFLOPS, in_features=4096, out_features=24576, bias=True) |
|
(norm): GemmaRMSNorm(4.1 K = 0% Params, 0 MACs = 0% MACs, 465.87 us = 0.19% latency, 0 FLOPS) |
|
) |
|
(self_attn): DiTSelfAttention( |
|
50.33 M = 0.55% Params, 223.34 GMACs = 0.9% MACs, 3.02 ms = 1.23% latency, 147.75 TFLOPS |
|
(q_norm): GemmaRMSNorm(128 = 0% Params, 0 MACs = 0% MACs, 465.87 us = 0.19% latency, 0 FLOPS) |
|
(k_norm): GemmaRMSNorm(128 = 0% Params, 0 MACs = 0% MACs, 236.51 us = 0.1% latency, 0 FLOPS) |
|
(q_proj): Linear(16.78 M = 0.18% Params, 68.72 GMACs = 0.28% MACs, 349.52 us = 0.14% latency, 393.22 TFLOPS, in_features=4096, out_features=4096, bias=False) |
|
(k_proj): Linear(4.19 M = 0.05% Params, 17.18 GMACs = 0.07% MACs, 197.65 us = 0.08% latency, 173.84 TFLOPS, in_features=4096, out_features=1024, bias=False) |
|
(v_proj): Linear(4.19 M = 0.05% Params, 17.18 GMACs = 0.07% MACs, 194.55 us = 0.08% latency, 176.61 TFLOPS, in_features=4096, out_features=1024, bias=False) |
|
(text_k_proj): Linear(4.19 M = 0.05% Params, 17.18 GMACs = 0.07% MACs, 158.07 us = 0.06% latency, 217.37 TFLOPS, in_features=4096, out_features=1024, bias=False) |
|
(text_v_proj): Linear(4.19 M = 0.05% Params, 17.18 GMACs = 0.07% MACs, 155.21 us = 0.06% latency, 221.38 TFLOPS, in_features=4096, out_features=1024, bias=False) |
|
(o_proj): Linear(16.78 M = 0.18% Params, 68.72 GMACs = 0.28% MACs, 338.55 us = 0.14% latency, 405.96 TFLOPS, in_features=4096, out_features=4096, bias=False) |
|
) |
|
(post_attention_layernorm): GemmaRMSNorm(4.1 K = 0% Params, 0 MACs = 0% MACs, 457.53 us = 0.19% latency, 0 FLOPS) |
|
(mlp): GemmaMLP( |
|
134.38 M = 1.46% Params, 550.43 GMACs = 2.21% MACs, 2.14 ms = 0.87% latency, 514.54 TFLOPS |
|
(gate_proj): Linear(44.79 M = 0.49% Params, 183.48 GMACs = 0.74% MACs, 626.8 us = 0.25% latency, 585.43 TFLOPS, in_features=4096, out_features=10936, bias=False) |
|
(up_proj): Linear(44.79 M = 0.49% Params, 183.48 GMACs = 0.74% MACs, 623.46 us = 0.25% latency, 588.57 TFLOPS, in_features=4096, out_features=10936, bias=False) |
|
(down_proj): Linear(44.79 M = 0.49% Params, 183.48 GMACs = 0.74% MACs, 585.56 us = 0.24% latency, 626.67 TFLOPS, in_features=10936, out_features=4096, bias=False) |
|
(act_fn): PytorchGELUTanh(0 = 0% Params, 0 MACs = 0% MACs, 100.85 us = 0.04% latency, 444.16 GFLOPS) |
|
) |
|
) |
|
(22): DiTLayer( |
|
285.41 M = 3.1% Params, 775.38 GMACs = 3.11% MACs, 7.2 ms = 2.93% latency, 215.32 TFLOPS |
|
(input_layernorm): AdaLayerNormZero( |
|
100.69 M = 1.09% Params, 1.61 GMACs = 0.01% MACs, 940.08 us = 0.38% latency, 3.43 TFLOPS |
|
(silu): SiLU(0 = 0% Params, 0 MACs = 0% MACs, 36.95 us = 0.02% latency, 1.77 GFLOPS) |
|
(linear): Linear(100.69 M = 1.09% Params, 1.61 GMACs = 0.01% MACs, 237.46 us = 0.1% latency, 13.57 TFLOPS, in_features=4096, out_features=24576, bias=True) |
|
(norm): GemmaRMSNorm(4.1 K = 0% Params, 0 MACs = 0% MACs, 459.19 us = 0.19% latency, 0 FLOPS) |
|
) |
|
(self_attn): DiTSelfAttention( |
|
50.33 M = 0.55% Params, 223.34 GMACs = 0.9% MACs, 3.02 ms = 1.23% latency, 148.01 TFLOPS |
|
(q_norm): GemmaRMSNorm(128 = 0% Params, 0 MACs = 0% MACs, 463.01 us = 0.19% latency, 0 FLOPS) |
|
(k_norm): GemmaRMSNorm(128 = 0% Params, 0 MACs = 0% MACs, 236.03 us = 0.1% latency, 0 FLOPS) |
|
(q_proj): Linear(16.78 M = 0.18% Params, 68.72 GMACs = 0.28% MACs, 348.57 us = 0.14% latency, 394.3 TFLOPS, in_features=4096, out_features=4096, bias=False) |
|
(k_proj): Linear(4.19 M = 0.05% Params, 17.18 GMACs = 0.07% MACs, 198.13 us = 0.08% latency, 173.42 TFLOPS, in_features=4096, out_features=1024, bias=False) |
|
(v_proj): Linear(4.19 M = 0.05% Params, 17.18 GMACs = 0.07% MACs, 194.55 us = 0.08% latency, 176.61 TFLOPS, in_features=4096, out_features=1024, bias=False) |
|
(text_k_proj): Linear(4.19 M = 0.05% Params, 17.18 GMACs = 0.07% MACs, 158.31 us = 0.06% latency, 217.04 TFLOPS, in_features=4096, out_features=1024, bias=False) |
|
(text_v_proj): Linear(4.19 M = 0.05% Params, 17.18 GMACs = 0.07% MACs, 152.35 us = 0.06% latency, 225.53 TFLOPS, in_features=4096, out_features=1024, bias=False) |
|
(o_proj): Linear(16.78 M = 0.18% Params, 68.72 GMACs = 0.28% MACs, 336.89 us = 0.14% latency, 407.97 TFLOPS, in_features=4096, out_features=4096, bias=False) |
|
) |
|
(post_attention_layernorm): GemmaRMSNorm(4.1 K = 0% Params, 0 MACs = 0% MACs, 457.05 us = 0.19% latency, 0 FLOPS) |
|
(mlp): GemmaMLP( |
|
134.38 M = 1.46% Params, 550.43 GMACs = 2.21% MACs, 2.14 ms = 0.87% latency, 515 TFLOPS |
|
(gate_proj): Linear(44.79 M = 0.49% Params, 183.48 GMACs = 0.74% MACs, 627.28 us = 0.25% latency, 584.99 TFLOPS, in_features=4096, out_features=10936, bias=False) |
|
(up_proj): Linear(44.79 M = 0.49% Params, 183.48 GMACs = 0.74% MACs, 624.66 us = 0.25% latency, 587.44 TFLOPS, in_features=4096, out_features=10936, bias=False) |
|
(down_proj): Linear(44.79 M = 0.49% Params, 183.48 GMACs = 0.74% MACs, 584.6 us = 0.24% latency, 627.69 TFLOPS, in_features=10936, out_features=4096, bias=False) |
|
(act_fn): PytorchGELUTanh(0 = 0% Params, 0 MACs = 0% MACs, 99.9 us = 0.04% latency, 448.4 GFLOPS) |
|
) |
|
) |
|
(23): DiTLayer( |
|
285.41 M = 3.1% Params, 775.38 GMACs = 3.11% MACs, 7.2 ms = 2.93% latency, 215.41 TFLOPS |
|
(input_layernorm): AdaLayerNormZero( |
|
100.69 M = 1.09% Params, 1.61 GMACs = 0.01% MACs, 944.85 us = 0.38% latency, 3.41 TFLOPS |
|
(silu): SiLU(0 = 0% Params, 0 MACs = 0% MACs, 36.95 us = 0.02% latency, 1.77 GFLOPS) |
|
(linear): Linear(100.69 M = 1.09% Params, 1.61 GMACs = 0.01% MACs, 237.94 us = 0.1% latency, 13.54 TFLOPS, in_features=4096, out_features=24576, bias=True) |
|
(norm): GemmaRMSNorm(4.1 K = 0% Params, 0 MACs = 0% MACs, 462.29 us = 0.19% latency, 0 FLOPS) |
|
) |
|
(self_attn): DiTSelfAttention( |
|
50.33 M = 0.55% Params, 223.34 GMACs = 0.9% MACs, 3.02 ms = 1.23% latency, 148.04 TFLOPS |
|
(q_norm): GemmaRMSNorm(128 = 0% Params, 0 MACs = 0% MACs, 465.39 us = 0.19% latency, 0 FLOPS) |
|
(k_norm): GemmaRMSNorm(128 = 0% Params, 0 MACs = 0% MACs, 237.7 us = 0.1% latency, 0 FLOPS) |
|
(q_proj): Linear(16.78 M = 0.18% Params, 68.72 GMACs = 0.28% MACs, 349.04 us = 0.14% latency, 393.76 TFLOPS, in_features=4096, out_features=4096, bias=False) |
|
(k_proj): Linear(4.19 M = 0.05% Params, 17.18 GMACs = 0.07% MACs, 199.08 us = 0.08% latency, 172.59 TFLOPS, in_features=4096, out_features=1024, bias=False) |
|
(v_proj): Linear(4.19 M = 0.05% Params, 17.18 GMACs = 0.07% MACs, 195.03 us = 0.08% latency, 176.18 TFLOPS, in_features=4096, out_features=1024, bias=False) |
|
(text_k_proj): Linear(4.19 M = 0.05% Params, 17.18 GMACs = 0.07% MACs, 158.79 us = 0.06% latency, 216.39 TFLOPS, in_features=4096, out_features=1024, bias=False) |
|
(text_v_proj): Linear(4.19 M = 0.05% Params, 17.18 GMACs = 0.07% MACs, 156.4 us = 0.06% latency, 219.69 TFLOPS, in_features=4096, out_features=1024, bias=False) |
|
(o_proj): Linear(16.78 M = 0.18% Params, 68.72 GMACs = 0.28% MACs, 334.02 us = 0.14% latency, 411.46 TFLOPS, in_features=4096, out_features=4096, bias=False) |
|
) |
|
(post_attention_layernorm): GemmaRMSNorm(4.1 K = 0% Params, 0 MACs = 0% MACs, 456.81 us = 0.19% latency, 0 FLOPS) |
|
(mlp): GemmaMLP( |
|
134.38 M = 1.46% Params, 550.43 GMACs = 2.21% MACs, 2.13 ms = 0.87% latency, 516.21 TFLOPS |
|
(gate_proj): Linear(44.79 M = 0.49% Params, 183.48 GMACs = 0.74% MACs, 627.28 us = 0.25% latency, 584.99 TFLOPS, in_features=4096, out_features=10936, bias=False) |
|
(up_proj): Linear(44.79 M = 0.49% Params, 183.48 GMACs = 0.74% MACs, 623.94 us = 0.25% latency, 588.12 TFLOPS, in_features=4096, out_features=10936, bias=False) |
|
(down_proj): Linear(44.79 M = 0.49% Params, 183.48 GMACs = 0.74% MACs, 581.98 us = 0.24% latency, 630.52 TFLOPS, in_features=10936, out_features=4096, bias=False) |
|
(act_fn): PytorchGELUTanh(0 = 0% Params, 0 MACs = 0% MACs, 100.61 us = 0.04% latency, 445.21 GFLOPS) |
|
) |
|
) |
|
(24): DiTLayer( |
|
285.41 M = 3.1% Params, 775.38 GMACs = 3.11% MACs, 7.23 ms = 2.94% latency, 214.37 TFLOPS |
|
(input_layernorm): AdaLayerNormZero( |
|
100.69 M = 1.09% Params, 1.61 GMACs = 0.01% MACs, 962.5 us = 0.39% latency, 3.35 TFLOPS |
|
(silu): SiLU(0 = 0% Params, 0 MACs = 0% MACs, 39.34 us = 0.02% latency, 1.67 GFLOPS) |
|
(linear): Linear(100.69 M = 1.09% Params, 1.61 GMACs = 0.01% MACs, 241.99 us = 0.1% latency, 13.31 TFLOPS, in_features=4096, out_features=24576, bias=True) |
|
(norm): GemmaRMSNorm(4.1 K = 0% Params, 0 MACs = 0% MACs, 462.53 us = 0.19% latency, 0 FLOPS) |
|
) |
|
(self_attn): DiTSelfAttention( |
|
50.33 M = 0.55% Params, 223.34 GMACs = 0.9% MACs, 3.02 ms = 1.23% latency, 147.81 TFLOPS |
|
(q_norm): GemmaRMSNorm(128 = 0% Params, 0 MACs = 0% MACs, 466.35 us = 0.19% latency, 0 FLOPS) |
|
(k_norm): GemmaRMSNorm(128 = 0% Params, 0 MACs = 0% MACs, 239.13 us = 0.1% latency, 0 FLOPS) |
|
(q_proj): Linear(16.78 M = 0.18% Params, 68.72 GMACs = 0.28% MACs, 350 us = 0.14% latency, 392.68 TFLOPS, in_features=4096, out_features=4096, bias=False) |
|
(k_proj): Linear(4.19 M = 0.05% Params, 17.18 GMACs = 0.07% MACs, 199.08 us = 0.08% latency, 172.59 TFLOPS, in_features=4096, out_features=1024, bias=False) |
|
(v_proj): Linear(4.19 M = 0.05% Params, 17.18 GMACs = 0.07% MACs, 195.98 us = 0.08% latency, 175.32 TFLOPS, in_features=4096, out_features=1024, bias=False) |
|
(text_k_proj): Linear(4.19 M = 0.05% Params, 17.18 GMACs = 0.07% MACs, 157.12 us = 0.06% latency, 218.69 TFLOPS, in_features=4096, out_features=1024, bias=False) |
|
(text_v_proj): Linear(4.19 M = 0.05% Params, 17.18 GMACs = 0.07% MACs, 154.5 us = 0.06% latency, 222.4 TFLOPS, in_features=4096, out_features=1024, bias=False) |
|
(o_proj): Linear(16.78 M = 0.18% Params, 68.72 GMACs = 0.28% MACs, 334.02 us = 0.14% latency, 411.46 TFLOPS, in_features=4096, out_features=4096, bias=False) |
|
) |
|
(post_attention_layernorm): GemmaRMSNorm(4.1 K = 0% Params, 0 MACs = 0% MACs, 457.76 us = 0.19% latency, 0 FLOPS) |
|
(mlp): GemmaMLP( |
|
134.38 M = 1.46% Params, 550.43 GMACs = 2.21% MACs, 2.14 ms = 0.87% latency, 514.43 TFLOPS |
|
(gate_proj): Linear(44.79 M = 0.49% Params, 183.48 GMACs = 0.74% MACs, 627.52 us = 0.26% latency, 584.77 TFLOPS, in_features=4096, out_features=10936, bias=False) |
|
(up_proj): Linear(44.79 M = 0.49% Params, 183.48 GMACs = 0.74% MACs, 623.94 us = 0.25% latency, 588.12 TFLOPS, in_features=4096, out_features=10936, bias=False) |
|
(down_proj): Linear(44.79 M = 0.49% Params, 183.48 GMACs = 0.74% MACs, 584.6 us = 0.24% latency, 627.69 TFLOPS, in_features=10936, out_features=4096, bias=False) |
|
(act_fn): PytorchGELUTanh(0 = 0% Params, 0 MACs = 0% MACs, 101.09 us = 0.04% latency, 443.11 GFLOPS) |
|
) |
|
) |
|
(25): DiTLayer( |
|
285.41 M = 3.1% Params, 775.38 GMACs = 3.11% MACs, 7.22 ms = 2.93% latency, 214.88 TFLOPS |
|
(input_layernorm): AdaLayerNormZero( |
|
100.69 M = 1.09% Params, 1.61 GMACs = 0.01% MACs, 940.32 us = 0.38% latency, 3.43 TFLOPS |
|
(silu): SiLU(0 = 0% Params, 0 MACs = 0% MACs, 36.48 us = 0.01% latency, 1.8 GFLOPS) |
|
(linear): Linear(100.69 M = 1.09% Params, 1.61 GMACs = 0.01% MACs, 233.17 us = 0.09% latency, 13.81 TFLOPS, in_features=4096, out_features=24576, bias=True) |
|
(norm): GemmaRMSNorm(4.1 K = 0% Params, 0 MACs = 0% MACs, 462.06 us = 0.19% latency, 0 FLOPS) |
|
) |
|
(self_attn): DiTSelfAttention( |
|
50.33 M = 0.55% Params, 223.34 GMACs = 0.9% MACs, 3.02 ms = 1.23% latency, 147.96 TFLOPS |
|
(q_norm): GemmaRMSNorm(128 = 0% Params, 0 MACs = 0% MACs, 464.44 us = 0.19% latency, 0 FLOPS) |
|
(k_norm): GemmaRMSNorm(128 = 0% Params, 0 MACs = 0% MACs, 242.95 us = 0.1% latency, 0 FLOPS) |
|
(q_proj): Linear(16.78 M = 0.18% Params, 68.72 GMACs = 0.28% MACs, 349.04 us = 0.14% latency, 393.76 TFLOPS, in_features=4096, out_features=4096, bias=False) |
|
(k_proj): Linear(4.19 M = 0.05% Params, 17.18 GMACs = 0.07% MACs, 199.08 us = 0.08% latency, 172.59 TFLOPS, in_features=4096, out_features=1024, bias=False) |
|
(v_proj): Linear(4.19 M = 0.05% Params, 17.18 GMACs = 0.07% MACs, 195.98 us = 0.08% latency, 175.32 TFLOPS, in_features=4096, out_features=1024, bias=False) |
|
(text_k_proj): Linear(4.19 M = 0.05% Params, 17.18 GMACs = 0.07% MACs, 157.12 us = 0.06% latency, 218.69 TFLOPS, in_features=4096, out_features=1024, bias=False) |
|
(text_v_proj): Linear(4.19 M = 0.05% Params, 17.18 GMACs = 0.07% MACs, 153.3 us = 0.06% latency, 224.13 TFLOPS, in_features=4096, out_features=1024, bias=False) |
|
(o_proj): Linear(16.78 M = 0.18% Params, 68.72 GMACs = 0.28% MACs, 335.93 us = 0.14% latency, 409.13 TFLOPS, in_features=4096, out_features=4096, bias=False) |
|
) |
|
(post_attention_layernorm): GemmaRMSNorm(4.1 K = 0% Params, 0 MACs = 0% MACs, 457.53 us = 0.19% latency, 0 FLOPS) |
|
(mlp): GemmaMLP( |
|
134.38 M = 1.46% Params, 550.43 GMACs = 2.21% MACs, 2.15 ms = 0.87% latency, 512.03 TFLOPS |
|
(gate_proj): Linear(44.79 M = 0.49% Params, 183.48 GMACs = 0.74% MACs, 633.48 us = 0.26% latency, 579.26 TFLOPS, in_features=4096, out_features=10936, bias=False) |
|
(up_proj): Linear(44.79 M = 0.49% Params, 183.48 GMACs = 0.74% MACs, 626.09 us = 0.25% latency, 586.1 TFLOPS, in_features=4096, out_features=10936, bias=False) |
|
(down_proj): Linear(44.79 M = 0.49% Params, 183.48 GMACs = 0.74% MACs, 588.42 us = 0.24% latency, 623.62 TFLOPS, in_features=10936, out_features=4096, bias=False) |
|
(act_fn): PytorchGELUTanh(0 = 0% Params, 0 MACs = 0% MACs, 100.37 us = 0.04% latency, 446.27 GFLOPS) |
|
) |
|
) |
|
(26): DiTLayer( |
|
285.41 M = 3.1% Params, 775.38 GMACs = 3.11% MACs, 7.31 ms = 2.97% latency, 212.01 TFLOPS |
|
(input_layernorm): AdaLayerNormZero( |
|
100.69 M = 1.09% Params, 1.61 GMACs = 0.01% MACs, 1.03 ms = 0.42% latency, 3.13 TFLOPS |
|
(silu): SiLU(0 = 0% Params, 0 MACs = 0% MACs, 36 us = 0.01% latency, 1.82 GFLOPS) |
|
(linear): Linear(100.69 M = 1.09% Params, 1.61 GMACs = 0.01% MACs, 235.8 us = 0.1% latency, 13.66 TFLOPS, in_features=4096, out_features=24576, bias=True) |
|
(norm): GemmaRMSNorm(4.1 K = 0% Params, 0 MACs = 0% MACs, 464.68 us = 0.19% latency, 0 FLOPS) |
|
) |
|
(self_attn): DiTSelfAttention( |
|
50.33 M = 0.55% Params, 223.34 GMACs = 0.9% MACs, 3.03 ms = 1.23% latency, 147.59 TFLOPS |
|
(q_norm): GemmaRMSNorm(128 = 0% Params, 0 MACs = 0% MACs, 468.02 us = 0.19% latency, 0 FLOPS) |
|
(k_norm): GemmaRMSNorm(128 = 0% Params, 0 MACs = 0% MACs, 237.23 us = 0.1% latency, 0 FLOPS) |
|
(q_proj): Linear(16.78 M = 0.18% Params, 68.72 GMACs = 0.28% MACs, 360.01 us = 0.15% latency, 381.76 TFLOPS, in_features=4096, out_features=4096, bias=False) |
|
(k_proj): Linear(4.19 M = 0.05% Params, 17.18 GMACs = 0.07% MACs, 197.89 us = 0.08% latency, 173.63 TFLOPS, in_features=4096, out_features=1024, bias=False) |
|
(v_proj): Linear(4.19 M = 0.05% Params, 17.18 GMACs = 0.07% MACs, 194.55 us = 0.08% latency, 176.61 TFLOPS, in_features=4096, out_features=1024, bias=False) |
|
(text_k_proj): Linear(4.19 M = 0.05% Params, 17.18 GMACs = 0.07% MACs, 156.64 us = 0.06% latency, 219.35 TFLOPS, in_features=4096, out_features=1024, bias=False) |
|
(text_v_proj): Linear(4.19 M = 0.05% Params, 17.18 GMACs = 0.07% MACs, 153.54 us = 0.06% latency, 223.78 TFLOPS, in_features=4096, out_features=1024, bias=False) |
|
(o_proj): Linear(16.78 M = 0.18% Params, 68.72 GMACs = 0.28% MACs, 335.93 us = 0.14% latency, 409.13 TFLOPS, in_features=4096, out_features=4096, bias=False) |
|
) |
|
(post_attention_layernorm): GemmaRMSNorm(4.1 K = 0% Params, 0 MACs = 0% MACs, 460.15 us = 0.19% latency, 0 FLOPS) |
|
(mlp): GemmaMLP( |
|
134.38 M = 1.46% Params, 550.43 GMACs = 2.21% MACs, 2.15 ms = 0.87% latency, 512.43 TFLOPS |
|
(gate_proj): Linear(44.79 M = 0.49% Params, 183.48 GMACs = 0.74% MACs, 629.43 us = 0.26% latency, 582.99 TFLOPS, in_features=4096, out_features=10936, bias=False) |
|
(up_proj): Linear(44.79 M = 0.49% Params, 183.48 GMACs = 0.74% MACs, 626.33 us = 0.25% latency, 585.88 TFLOPS, in_features=4096, out_features=10936, bias=False) |
|
(down_proj): Linear(44.79 M = 0.49% Params, 183.48 GMACs = 0.74% MACs, 588.89 us = 0.24% latency, 623.12 TFLOPS, in_features=10936, out_features=4096, bias=False) |
|
(act_fn): PytorchGELUTanh(0 = 0% Params, 0 MACs = 0% MACs, 100.85 us = 0.04% latency, 444.16 GFLOPS) |
|
) |
|
) |
|
(27): DiTLayer( |
|
285.41 M = 3.1% Params, 775.38 GMACs = 3.11% MACs, 9.77 ms = 3.97% latency, 158.72 TFLOPS |
|
(input_layernorm): AdaLayerNormZero( |
|
100.69 M = 1.09% Params, 1.61 GMACs = 0.01% MACs, 991.82 us = 0.4% latency, 3.25 TFLOPS |
|
(silu): SiLU(0 = 0% Params, 0 MACs = 0% MACs, 83.68 us = 0.03% latency, 783.13 MFLOPS) |
|
(linear): Linear(100.69 M = 1.09% Params, 1.61 GMACs = 0.01% MACs, 236.75 us = 0.1% latency, 13.61 TFLOPS, in_features=4096, out_features=24576, bias=True) |
|
(norm): GemmaRMSNorm(4.1 K = 0% Params, 0 MACs = 0% MACs, 463.49 us = 0.19% latency, 0 FLOPS) |
|
) |
|
(self_attn): DiTSelfAttention( |
|
50.33 M = 0.55% Params, 223.34 GMACs = 0.9% MACs, 3.01 ms = 1.22% latency, 148.24 TFLOPS |
|
(q_norm): GemmaRMSNorm(128 = 0% Params, 0 MACs = 0% MACs, 464.2 us = 0.19% latency, 0 FLOPS) |
|
(k_norm): GemmaRMSNorm(128 = 0% Params, 0 MACs = 0% MACs, 238.42 us = 0.1% latency, 0 FLOPS) |
|
(q_proj): Linear(16.78 M = 0.18% Params, 68.72 GMACs = 0.28% MACs, 348.33 us = 0.14% latency, 394.57 TFLOPS, in_features=4096, out_features=4096, bias=False) |
|
(k_proj): Linear(4.19 M = 0.05% Params, 17.18 GMACs = 0.07% MACs, 197.89 us = 0.08% latency, 173.63 TFLOPS, in_features=4096, out_features=1024, bias=False) |
|
(v_proj): Linear(4.19 M = 0.05% Params, 17.18 GMACs = 0.07% MACs, 195.26 us = 0.08% latency, 175.96 TFLOPS, in_features=4096, out_features=1024, bias=False) |
|
(text_k_proj): Linear(4.19 M = 0.05% Params, 17.18 GMACs = 0.07% MACs, 156.64 us = 0.06% latency, 219.35 TFLOPS, in_features=4096, out_features=1024, bias=False) |
|
(text_v_proj): Linear(4.19 M = 0.05% Params, 17.18 GMACs = 0.07% MACs, 154.26 us = 0.06% latency, 222.74 TFLOPS, in_features=4096, out_features=1024, bias=False) |
|
(o_proj): Linear(16.78 M = 0.18% Params, 68.72 GMACs = 0.28% MACs, 336.41 us = 0.14% latency, 408.55 TFLOPS, in_features=4096, out_features=4096, bias=False) |
|
) |
|
(post_attention_layernorm): GemmaRMSNorm(4.1 K = 0% Params, 0 MACs = 0% MACs, 458.72 us = 0.19% latency, 0 FLOPS) |
|
(mlp): GemmaMLP( |
|
134.38 M = 1.46% Params, 550.43 GMACs = 2.21% MACs, 4.59 ms = 1.87% latency, 239.65 TFLOPS |
|
(gate_proj): Linear(44.79 M = 0.49% Params, 183.48 GMACs = 0.74% MACs, 629.66 us = 0.26% latency, 582.77 TFLOPS, in_features=4096, out_features=10936, bias=False) |
|
(up_proj): Linear(44.79 M = 0.49% Params, 183.48 GMACs = 0.74% MACs, 884.77 us = 0.36% latency, 414.74 TFLOPS, in_features=4096, out_features=10936, bias=False) |
|
(down_proj): Linear(44.79 M = 0.49% Params, 183.48 GMACs = 0.74% MACs, 599.62 us = 0.24% latency, 611.97 TFLOPS, in_features=10936, out_features=4096, bias=False) |
|
(act_fn): PytorchGELUTanh(0 = 0% Params, 0 MACs = 0% MACs, 1.97 ms = 0.8% latency, 22.75 GFLOPS) |
|
) |
|
) |
|
(28): DiTLayer( |
|
285.41 M = 3.1% Params, 775.38 GMACs = 3.11% MACs, 7.41 ms = 3.01% latency, 209.21 TFLOPS |
|
(input_layernorm): AdaLayerNormZero( |
|
100.69 M = 1.09% Params, 1.61 GMACs = 0.01% MACs, 982.52 us = 0.4% latency, 3.28 TFLOPS |
|
(silu): SiLU(0 = 0% Params, 0 MACs = 0% MACs, 41.01 us = 0.02% latency, 1.6 GFLOPS) |
|
(linear): Linear(100.69 M = 1.09% Params, 1.61 GMACs = 0.01% MACs, 244.14 us = 0.1% latency, 13.19 TFLOPS, in_features=4096, out_features=24576, bias=True) |
|
(norm): GemmaRMSNorm(4.1 K = 0% Params, 0 MACs = 0% MACs, 469.21 us = 0.19% latency, 0 FLOPS) |
|
) |
|
(self_attn): DiTSelfAttention( |
|
50.33 M = 0.55% Params, 223.34 GMACs = 0.9% MACs, 3.09 ms = 1.25% latency, 144.71 TFLOPS |
|
(q_norm): GemmaRMSNorm(128 = 0% Params, 0 MACs = 0% MACs, 464.92 us = 0.19% latency, 0 FLOPS) |
|
(k_norm): GemmaRMSNorm(128 = 0% Params, 0 MACs = 0% MACs, 260.35 us = 0.11% latency, 0 FLOPS) |
|
(q_proj): Linear(16.78 M = 0.18% Params, 68.72 GMACs = 0.28% MACs, 366.21 us = 0.15% latency, 375.3 TFLOPS, in_features=4096, out_features=4096, bias=False) |
|
(k_proj): Linear(4.19 M = 0.05% Params, 17.18 GMACs = 0.07% MACs, 218.39 us = 0.09% latency, 157.33 TFLOPS, in_features=4096, out_features=1024, bias=False) |
|
(v_proj): Linear(4.19 M = 0.05% Params, 17.18 GMACs = 0.07% MACs, 196.7 us = 0.08% latency, 174.69 TFLOPS, in_features=4096, out_features=1024, bias=False) |
|
(text_k_proj): Linear(4.19 M = 0.05% Params, 17.18 GMACs = 0.07% MACs, 156.4 us = 0.06% latency, 219.69 TFLOPS, in_features=4096, out_features=1024, bias=False) |
|
(text_v_proj): Linear(4.19 M = 0.05% Params, 17.18 GMACs = 0.07% MACs, 153.06 us = 0.06% latency, 224.48 TFLOPS, in_features=4096, out_features=1024, bias=False) |
|
(o_proj): Linear(16.78 M = 0.18% Params, 68.72 GMACs = 0.28% MACs, 339.03 us = 0.14% latency, 405.39 TFLOPS, in_features=4096, out_features=4096, bias=False) |
|
) |
|
(post_attention_layernorm): GemmaRMSNorm(4.1 K = 0% Params, 0 MACs = 0% MACs, 457.29 us = 0.19% latency, 0 FLOPS) |
|
(mlp): GemmaMLP( |
|
134.38 M = 1.46% Params, 550.43 GMACs = 2.21% MACs, 2.2 ms = 0.89% latency, 501.25 TFLOPS |
|
(gate_proj): Linear(44.79 M = 0.49% Params, 183.48 GMACs = 0.74% MACs, 640.87 us = 0.26% latency, 572.58 TFLOPS, in_features=4096, out_features=10936, bias=False) |
|
(up_proj): Linear(44.79 M = 0.49% Params, 183.48 GMACs = 0.74% MACs, 626.8 us = 0.25% latency, 585.43 TFLOPS, in_features=4096, out_features=10936, bias=False) |
|
(down_proj): Linear(44.79 M = 0.49% Params, 183.48 GMACs = 0.74% MACs, 588.66 us = 0.24% latency, 623.37 TFLOPS, in_features=10936, out_features=4096, bias=False) |
|
(act_fn): PytorchGELUTanh(0 = 0% Params, 0 MACs = 0% MACs, 114.92 us = 0.05% latency, 389.79 GFLOPS) |
|
) |
|
) |
|
(29): DiTLayer( |
|
285.41 M = 3.1% Params, 775.38 GMACs = 3.11% MACs, 7.29 ms = 2.96% latency, 212.8 TFLOPS |
|
(input_layernorm): AdaLayerNormZero( |
|
100.69 M = 1.09% Params, 1.61 GMACs = 0.01% MACs, 957.73 us = 0.39% latency, 3.36 TFLOPS |
|
(silu): SiLU(0 = 0% Params, 0 MACs = 0% MACs, 40.05 us = 0.02% latency, 1.64 GFLOPS) |
|
(linear): Linear(100.69 M = 1.09% Params, 1.61 GMACs = 0.01% MACs, 244.14 us = 0.1% latency, 13.19 TFLOPS, in_features=4096, out_features=24576, bias=True) |
|
(norm): GemmaRMSNorm(4.1 K = 0% Params, 0 MACs = 0% MACs, 463.25 us = 0.19% latency, 0 FLOPS) |
|
) |
|
(self_attn): DiTSelfAttention( |
|
50.33 M = 0.55% Params, 223.34 GMACs = 0.9% MACs, 3.05 ms = 1.24% latency, 146.32 TFLOPS |
|
(q_norm): GemmaRMSNorm(128 = 0% Params, 0 MACs = 0% MACs, 463.49 us = 0.19% latency, 0 FLOPS) |
|
(k_norm): GemmaRMSNorm(128 = 0% Params, 0 MACs = 0% MACs, 236.75 us = 0.1% latency, 0 FLOPS) |
|
(q_proj): Linear(16.78 M = 0.18% Params, 68.72 GMACs = 0.28% MACs, 351.19 us = 0.14% latency, 391.35 TFLOPS, in_features=4096, out_features=4096, bias=False) |
|
(k_proj): Linear(4.19 M = 0.05% Params, 17.18 GMACs = 0.07% MACs, 199.32 us = 0.08% latency, 172.39 TFLOPS, in_features=4096, out_features=1024, bias=False) |
|
(v_proj): Linear(4.19 M = 0.05% Params, 17.18 GMACs = 0.07% MACs, 194.55 us = 0.08% latency, 176.61 TFLOPS, in_features=4096, out_features=1024, bias=False) |
|
(text_k_proj): Linear(4.19 M = 0.05% Params, 17.18 GMACs = 0.07% MACs, 156.16 us = 0.06% latency, 220.02 TFLOPS, in_features=4096, out_features=1024, bias=False) |
|
(text_v_proj): Linear(4.19 M = 0.05% Params, 17.18 GMACs = 0.07% MACs, 175.95 us = 0.07% latency, 195.28 TFLOPS, in_features=4096, out_features=1024, bias=False) |
|
(o_proj): Linear(16.78 M = 0.18% Params, 68.72 GMACs = 0.28% MACs, 355.96 us = 0.14% latency, 386.11 TFLOPS, in_features=4096, out_features=4096, bias=False) |
|
) |
|
(post_attention_layernorm): GemmaRMSNorm(4.1 K = 0% Params, 0 MACs = 0% MACs, 457.76 us = 0.19% latency, 0 FLOPS) |
|
(mlp): GemmaMLP( |
|
134.38 M = 1.46% Params, 550.43 GMACs = 2.21% MACs, 2.16 ms = 0.88% latency, 509.32 TFLOPS |
|
(gate_proj): Linear(44.79 M = 0.49% Params, 183.48 GMACs = 0.74% MACs, 643.97 us = 0.26% latency, 569.83 TFLOPS, in_features=4096, out_features=10936, bias=False) |
|
(up_proj): Linear(44.79 M = 0.49% Params, 183.48 GMACs = 0.74% MACs, 625.37 us = 0.25% latency, 586.77 TFLOPS, in_features=4096, out_features=10936, bias=False) |
|
(down_proj): Linear(44.79 M = 0.49% Params, 183.48 GMACs = 0.74% MACs, 589.37 us = 0.24% latency, 622.62 TFLOPS, in_features=10936, out_features=4096, bias=False) |
|
(act_fn): PytorchGELUTanh(0 = 0% Params, 0 MACs = 0% MACs, 100.85 us = 0.04% latency, 444.16 GFLOPS) |
|
) |
|
) |
|
(30): DiTLayer( |
|
285.41 M = 3.1% Params, 775.38 GMACs = 3.11% MACs, 7.26 ms = 2.95% latency, 213.5 TFLOPS |
|
(input_layernorm): AdaLayerNormZero( |
|
100.69 M = 1.09% Params, 1.61 GMACs = 0.01% MACs, 937.22 us = 0.38% latency, 3.44 TFLOPS |
|
(silu): SiLU(0 = 0% Params, 0 MACs = 0% MACs, 39.58 us = 0.02% latency, 1.66 GFLOPS) |
|
(linear): Linear(100.69 M = 1.09% Params, 1.61 GMACs = 0.01% MACs, 231.74 us = 0.09% latency, 13.9 TFLOPS, in_features=4096, out_features=24576, bias=True) |
|
(norm): GemmaRMSNorm(4.1 K = 0% Params, 0 MACs = 0% MACs, 459.91 us = 0.19% latency, 0 FLOPS) |
|
) |
|
(self_attn): DiTSelfAttention( |
|
50.33 M = 0.55% Params, 223.34 GMACs = 0.9% MACs, 3.05 ms = 1.24% latency, 146.45 TFLOPS |
|
(q_norm): GemmaRMSNorm(128 = 0% Params, 0 MACs = 0% MACs, 464.68 us = 0.19% latency, 0 FLOPS) |
|
(k_norm): GemmaRMSNorm(128 = 0% Params, 0 MACs = 0% MACs, 238.18 us = 0.1% latency, 0 FLOPS) |
|
(q_proj): Linear(16.78 M = 0.18% Params, 68.72 GMACs = 0.28% MACs, 350 us = 0.14% latency, 392.68 TFLOPS, in_features=4096, out_features=4096, bias=False) |
|
(k_proj): Linear(4.19 M = 0.05% Params, 17.18 GMACs = 0.07% MACs, 195.98 us = 0.08% latency, 175.32 TFLOPS, in_features=4096, out_features=1024, bias=False) |
|
(v_proj): Linear(4.19 M = 0.05% Params, 17.18 GMACs = 0.07% MACs, 214.82 us = 0.09% latency, 159.95 TFLOPS, in_features=4096, out_features=1024, bias=False) |
|
(text_k_proj): Linear(4.19 M = 0.05% Params, 17.18 GMACs = 0.07% MACs, 157.83 us = 0.06% latency, 217.7 TFLOPS, in_features=4096, out_features=1024, bias=False) |
|
(text_v_proj): Linear(4.19 M = 0.05% Params, 17.18 GMACs = 0.07% MACs, 168.56 us = 0.07% latency, 203.84 TFLOPS, in_features=4096, out_features=1024, bias=False) |
|
(o_proj): Linear(16.78 M = 0.18% Params, 68.72 GMACs = 0.28% MACs, 338.55 us = 0.14% latency, 405.96 TFLOPS, in_features=4096, out_features=4096, bias=False) |
|
) |
|
(post_attention_layernorm): GemmaRMSNorm(4.1 K = 0% Params, 0 MACs = 0% MACs, 458.24 us = 0.19% latency, 0 FLOPS) |
|
(mlp): GemmaMLP( |
|
134.38 M = 1.46% Params, 550.43 GMACs = 2.21% MACs, 2.17 ms = 0.88% latency, 508.14 TFLOPS |
|
(gate_proj): Linear(44.79 M = 0.49% Params, 183.48 GMACs = 0.74% MACs, 649.45 us = 0.26% latency, 565.02 TFLOPS, in_features=4096, out_features=10936, bias=False) |
|
(up_proj): Linear(44.79 M = 0.49% Params, 183.48 GMACs = 0.74% MACs, 626.56 us = 0.25% latency, 585.66 TFLOPS, in_features=4096, out_features=10936, bias=False) |
|
(down_proj): Linear(44.79 M = 0.49% Params, 183.48 GMACs = 0.74% MACs, 587.7 us = 0.24% latency, 624.38 TFLOPS, in_features=10936, out_features=4096, bias=False) |
|
(act_fn): PytorchGELUTanh(0 = 0% Params, 0 MACs = 0% MACs, 101.57 us = 0.04% latency, 441.03 GFLOPS) |
|
) |
|
) |
|
(31): DiTLayer( |
|
285.41 M = 3.1% Params, 775.38 GMACs = 3.11% MACs, 7.25 ms = 2.95% latency, 213.94 TFLOPS |
|
(input_layernorm): AdaLayerNormZero( |
|
100.69 M = 1.09% Params, 1.61 GMACs = 0.01% MACs, 940.56 us = 0.38% latency, 3.42 TFLOPS |
|
(silu): SiLU(0 = 0% Params, 0 MACs = 0% MACs, 37.19 us = 0.02% latency, 1.76 GFLOPS) |
|
(linear): Linear(100.69 M = 1.09% Params, 1.61 GMACs = 0.01% MACs, 235.08 us = 0.1% latency, 13.7 TFLOPS, in_features=4096, out_features=24576, bias=True) |
|
(norm): GemmaRMSNorm(4.1 K = 0% Params, 0 MACs = 0% MACs, 462.29 us = 0.19% latency, 0 FLOPS) |
|
) |
|
(self_attn): DiTSelfAttention( |
|
50.33 M = 0.55% Params, 223.34 GMACs = 0.9% MACs, 3.03 ms = 1.23% latency, 147.55 TFLOPS |
|
(q_norm): GemmaRMSNorm(128 = 0% Params, 0 MACs = 0% MACs, 466.11 us = 0.19% latency, 0 FLOPS) |
|
(k_norm): GemmaRMSNorm(128 = 0% Params, 0 MACs = 0% MACs, 235.32 us = 0.1% latency, 0 FLOPS) |
|
(q_proj): Linear(16.78 M = 0.18% Params, 68.72 GMACs = 0.28% MACs, 347.38 us = 0.14% latency, 395.65 TFLOPS, in_features=4096, out_features=4096, bias=False) |
|
(k_proj): Linear(4.19 M = 0.05% Params, 17.18 GMACs = 0.07% MACs, 216.01 us = 0.09% latency, 159.07 TFLOPS, in_features=4096, out_features=1024, bias=False) |
|
(v_proj): Linear(4.19 M = 0.05% Params, 17.18 GMACs = 0.07% MACs, 194.31 us = 0.08% latency, 176.83 TFLOPS, in_features=4096, out_features=1024, bias=False) |
|
(text_k_proj): Linear(4.19 M = 0.05% Params, 17.18 GMACs = 0.07% MACs, 156.16 us = 0.06% latency, 220.02 TFLOPS, in_features=4096, out_features=1024, bias=False) |
|
(text_v_proj): Linear(4.19 M = 0.05% Params, 17.18 GMACs = 0.07% MACs, 154.5 us = 0.06% latency, 222.4 TFLOPS, in_features=4096, out_features=1024, bias=False) |
|
(o_proj): Linear(16.78 M = 0.18% Params, 68.72 GMACs = 0.28% MACs, 340.46 us = 0.14% latency, 403.68 TFLOPS, in_features=4096, out_features=4096, bias=False) |
|
) |
|
(post_attention_layernorm): GemmaRMSNorm(4.1 K = 0% Params, 0 MACs = 0% MACs, 456.81 us = 0.19% latency, 0 FLOPS) |
|
(mlp): GemmaMLP( |
|
134.38 M = 1.46% Params, 550.43 GMACs = 2.21% MACs, 2.17 ms = 0.88% latency, 508.42 TFLOPS |
|
(gate_proj): Linear(44.79 M = 0.49% Params, 183.48 GMACs = 0.74% MACs, 641.82 us = 0.26% latency, 571.73 TFLOPS, in_features=4096, out_features=10936, bias=False) |
|
(up_proj): Linear(44.79 M = 0.49% Params, 183.48 GMACs = 0.74% MACs, 624.9 us = 0.25% latency, 587.22 TFLOPS, in_features=4096, out_features=10936, bias=False) |
|
(down_proj): Linear(44.79 M = 0.49% Params, 183.48 GMACs = 0.74% MACs, 581.98 us = 0.24% latency, 630.52 TFLOPS, in_features=10936, out_features=4096, bias=False) |
|
(act_fn): PytorchGELUTanh(0 = 0% Params, 0 MACs = 0% MACs, 100.85 us = 0.04% latency, 444.16 GFLOPS) |
|
) |
|
) |
|
) |
|
(patch_embed): PatchEmbed( |
|
266.24 K = 0% Params, 1.07 GMACs = 0% MACs, 627.04 us = 0.25% latency, 3.45 TFLOPS |
|
(proj): Conv2d(266.24 K = 0% Params, 1.07 GMACs = 0% MACs, 391.01 us = 0.16% latency, 5.54 TFLOPS, 16, 4096, kernel_size=(2, 2), stride=(2, 2)) |
|
) |
|
(rotary_emb): GemmaRotaryEmbedding(0 = 0% Params, 0 MACs = 0% MACs, 0 s = 0% latency, 0 FLOPS) |
|
(time_proj): Timesteps(0 = 0% Params, 0 MACs = 0% MACs, 261.78 us = 0.11% latency, 0 FLOPS) |
|
(timestep_embedder): Sequential( |
|
17.83 M = 0.19% Params, 285.21 MMACs = 0% MACs, 520.94 us = 0.21% latency, 1.1 TFLOPS |
|
(0): Linear(1.05 M = 0.01% Params, 16.78 MMACs = 0% MACs, 221.73 us = 0.09% latency, 151.33 GFLOPS, in_features=256, out_features=4096, bias=True) |
|
(1): SiLU(0 = 0% Params, 0 MACs = 0% MACs, 41.96 us = 0.02% latency, 1.56 GFLOPS) |
|
(2): Linear(16.78 M = 0.18% Params, 268.44 MMACs = 0% MACs, 184.54 us = 0.07% latency, 2.91 TFLOPS, in_features=4096, out_features=4096, bias=True) |
|
) |
|
(context_embedder): Sequential( |
|
8.39 M = 0.09% Params, 34.36 GMACs = 0.14% MACs, 499.01 us = 0.2% latency, 137.71 TFLOPS |
|
(0): GemmaRMSNorm(2.05 K = 0% Params, 0 MACs = 0% MACs, 178.81 us = 0.07% latency, 0 FLOPS) |
|
(1): Linear(8.39 M = 0.09% Params, 34.36 GMACs = 0.14% MACs, 267.27 us = 0.11% latency, 257.12 TFLOPS, in_features=2048, out_features=4096, bias=True) |
|
) |
|
(norm_out): AdaLayerNormOut( |
|
33.57 M = 0.36% Params, 536.87 MMACs = 0% MACs, 921.01 us = 0.37% latency, 1.17 TFLOPS |
|
(silu): SiLU(0 = 0% Params, 0 MACs = 0% MACs, 51.5 us = 0.02% latency, 1.27 GFLOPS) |
|
(linear): Linear(33.56 M = 0.36% Params, 536.87 MMACs = 0% MACs, 197.89 us = 0.08% latency, 5.43 TFLOPS, in_features=4096, out_features=8192, bias=True) |
|
(norm): GemmaRMSNorm(4.1 K = 0% Params, 0 MACs = 0% MACs, 461.1 us = 0.19% latency, 0 FLOPS) |
|
) |
|
(proj_out): Linear(262.21 K = 0% Params, 1.07 GMACs = 0% MACs, 205.99 us = 0.08% latency, 10.42 TFLOPS, in_features=4096, out_features=64, bias=True) |
|
(repa_projector): Sequential( |
|
14.16 M = 0.15% Params, 57.98 GMACs = 0.23% MACs, 774.15 us = 0.31% latency, 149.82 TFLOPS |
|
(0): Linear(8.39 M = 0.09% Params, 34.36 GMACs = 0.14% MACs, 276.33 us = 0.11% latency, 248.69 TFLOPS, in_features=4096, out_features=2048, bias=True) |
|
(1): SiLU(0 = 0% Params, 0 MACs = 0% MACs, 33.62 us = 0.01% latency, 249.53 GFLOPS) |
|
(2): Linear(4.2 M = 0.05% Params, 17.18 GMACs = 0.07% MACs, 185.73 us = 0.08% latency, 185 TFLOPS, in_features=2048, out_features=2048, bias=True) |
|
(3): SiLU(0 = 0% Params, 0 MACs = 0% MACs, 30.04 us = 0.01% latency, 279.24 GFLOPS) |
|
(4): Linear(1.57 M = 0.02% Params, 6.44 GMACs = 0.03% MACs, 144.48 us = 0.06% latency, 89.18 TFLOPS, in_features=2048, out_features=768, bias=True) |
|
) |
|
) |
|
------------------------------------------------------------------------------ |
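The relations used throughout the per-module dump above can be checked directly from the printed values. Below is a minimal sanity-check sketch in Python, using the o_proj entry of one DiTLayer as the worked example (values copied from the dump; rounding in the printout is ignored): a bias-free Linear has in_features * out_features parameters, flops are taken as 2 * MACs, and the per-module FLOPS figure is flops divided by the measured forward latency.

# Minimal sanity check of the per-module numbers printed above.
# Values are copied from an o_proj row of the dump; the relations
# (flops = 2 * MACs, FLOPS = flops / latency, params of a bias-free
# Linear = in_features * out_features) match the profiler's notation.

in_features, out_features = 4096, 4096
params = in_features * out_features          # 16_777_216 ~= "16.78 M"

macs = 68.72e9                               # "68.72 GMACs" as printed
flops = 2 * macs                             # one MAC = one multiply + one add
latency_s = 334.02e-6                        # "334.02 us" as printed

tflops = flops / latency_s / 1e12
print(f"params ~= {params/1e6:.2f} M, achieved ~= {tflops:.1f} TFLOPS")
# -> params ~= 16.78 M, achieved ~= 411.5 TFLOPS (printout: 411.46 TFLOPS)

The same check applies to any Linear row above. Rows printed with 0 MACs (the GemmaRMSNorm, Timesteps and GemmaRotaryEmbedding entries) still report a nonzero latency, which is why their FLOPS column reads 0.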
|
|
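For reference, a dump in this format is what DeepSpeed's built-in flops profiler emits when enabled through the training config. A minimal sketch of the relevant config fragment is shown below in Python dict form; the option names are the standard flops_profiler keys, while the specific values are illustrative and are not taken from the run that produced this log.

# Sketch of enabling the DeepSpeed flops profiler via the training config.
# Only the flops_profiler block is shown; values here are illustrative.

ds_config = {
    # ... existing training config (optimizer, precision, ZeRO, etc.) ...
    "flops_profiler": {
        "enabled": True,
        "profile_step": 2,     # which training step to profile
        "module_depth": -1,    # -1 = profile every nesting depth
        "top_modules": 1,      # modules listed per depth in the aggregated view
        "detailed": True,      # print the per-module tree
        "output_file": None,   # None = write to stdout
    },
}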