-------------------------- DeepSpeed Flops Profiler --------------------------
Profile Summary at step 2:
Notations:
data parallel size (dp_size), model parallel size (mp_size),
number of parameters (params), number of multiply-accumulate operations (MACs),
number of floating-point operations (flops), floating-point operations per second (FLOPS),
fwd latency (forward propagation latency), bwd latency (backward propagation latency),
step (weights update latency), iter latency (sum of fwd, bwd and step latency)
world size: 32
data parallel size: 32
model parallel size: 1
batch size per GPU: 16
params per GPU: 8.08 B
params of model = params per GPU * mp_size: 8.08 B
fwd MACs per GPU: 21.86 TMACs
fwd flops per GPU: 43.71 T
fwd flops of model = fwd flops per GPU * mp_size: 43.71 T
fwd latency: 230.07 ms
fwd FLOPS per GPU = fwd flops per GPU / fwd latency: 190 TFLOPS
bwd latency: 848.67 ms
bwd FLOPS per GPU = 2 * fwd flops per GPU / bwd latency: 103.02 TFLOPS
fwd+bwd FLOPS per GPU = 3 * fwd flops per GPU / (fwd+bwd latency): 121.57 TFLOPS
step latency: 387.21 ms
iter latency: 1.47 s
FLOPS per GPU = 3 * fwd flops per GPU / iter latency: 89.46 TFLOPS
samples/second: 349.26
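
The derived throughput figures in this summary follow directly from the measured latencies and the theoretical forward flops, using the profiler's stated convention that the backward pass costs roughly 2x the forward flops (so a full fwd+bwd pass is ~3x). A minimal Python sketch, with the values hard-coded from the numbers above, reproduces them:

# Reproduce the derived throughput numbers from the raw measurements above.
# Convention from the summary: bwd flops ~= 2 * fwd flops, fwd+bwd ~= 3 * fwd flops.
fwd_flops = 43.71e12      # fwd flops per GPU
fwd_latency = 230.07e-3   # s
bwd_latency = 848.67e-3   # s
step_latency = 387.21e-3  # s
iter_latency = fwd_latency + bwd_latency + step_latency   # ~1.47 s

batch_per_gpu = 16
world_size = 32

fwd_tflops = fwd_flops / fwd_latency / 1e12                          # ~190 TFLOPS
bwd_tflops = 2 * fwd_flops / bwd_latency / 1e12                      # ~103 TFLOPS
fwd_bwd_tflops = 3 * fwd_flops / (fwd_latency + bwd_latency) / 1e12  # ~121.6 TFLOPS
iter_tflops = 3 * fwd_flops / iter_latency / 1e12                    # ~89.5 TFLOPS
samples_per_sec = batch_per_gpu * world_size / iter_latency          # ~349 samples/s

print(fwd_tflops, bwd_tflops, fwd_bwd_tflops, iter_tflops, samples_per_sec)
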
----------------------------- Aggregated Profile per GPU -----------------------------
Top 1 modules in terms of params, MACs or fwd latency at different model depths:
depth 0:
params - {'DiT': '8.08 B'}
MACs - {'DiT': '21.86 TMACs'}
fwd latency - {'DiT': '229.91 ms'}
depth 1:
params - {'ModuleList': '8.02 B'}
MACs - {'ModuleList': '21.82 TMACs'}
fwd latency - {'ModuleList': '219.96 ms'}
depth 2:
params - {'DiTLayer': '8.02 B'}
MACs - {'DiTLayer': '21.82 TMACs'}
fwd latency - {'DiTLayer': '219.96 ms'}
depth 3:
params - {'GemmaMLP': '3.77 B'}
MACs - {'GemmaMLP': '15.46 TMACs'}
fwd latency - {'DiTSelfAttention': '93.22 ms'}
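
Output of this shape is produced by DeepSpeed's built-in flops profiler, which is enabled through the engine config rather than through changes to the model code. Below is a minimal sketch of the relevant config section, written as a Python dict for readability; the profiler-specific values are chosen to match this log (profile at step 2, top-1 module per depth), and the rest of the training config is omitted and assumed to be defined elsewhere.

# Minimal sketch of the DeepSpeed config section that enables the flops profiler.
# Values mirror this log ("Profile Summary at step 2", "Top 1 modules ..."); the
# remaining training config (optimizer, ZeRO, etc.) is intentionally left out.
ds_config = {
    "train_micro_batch_size_per_gpu": 16,   # matches "batch size per GPU: 16"
    "flops_profiler": {
        "enabled": True,
        "profile_step": 2,     # profile and print at the 2nd training step
        "module_depth": -1,    # aggregate over all module depths
        "top_modules": 1,      # report the top-1 module per depth, as above
        "detailed": True,      # also emit the per-module tree shown below
        "output_file": None,   # None -> print to stdout
    },
}
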
------------------------------ Detailed Profile per GPU ------------------------------
Each module profile is listed after its name in the following order:
params, percentage of total params, MACs, percentage of total MACs, fwd latency, percentage of total fwd latency, fwd FLOPS
Note: 1. A module may call torch.nn modules or torch.nn.functional functions directly (e.g. CrossEntropyLoss). These are not counted as submodules and are therefore not printed, but they account for the difference between a parent's MACs (or latency) and the sum of its submodules'.
2. The number of floating-point operations is a theoretical estimate, so FLOPS computed from it can exceed the maximum system throughput.
3. The fwd latency listed in the top module's profile is captured directly at the module's forward function in PyTorch, so it is less than the fwd latency shown above, which is measured by DeepSpeed.
DiT(
8.08 B = 100% Params, 21.86 TMACs = 100% MACs, 229.91 ms = 100% latency, 190.13 TFLOPS
(layers): ModuleList(
(0): DiTLayer(
250.71 M = 3.1% Params, 681.9 GMACs = 3.12% MACs, 6.98 ms = 3.03% latency, 195.53 TFLOPS
(input_layernorm): AdaLayerNormZero(
88.5 M = 1.1% Params, 1.42 GMACs = 0.01% MACs, 940.56 us = 0.41% latency, 3.01 TFLOPS
(silu): SiLU(0 = 0% Params, 0 MACs = 0% MACs, 38.62 us = 0.02% latency, 1.59 GFLOPS)
(linear): Linear(88.5 M = 1.1% Params, 1.42 GMACs = 0.01% MACs, 251.29 us = 0.11% latency, 11.27 TFLOPS, in_features=3840, out_features=23040, bias=True)
(norm): GemmaRMSNorm(3.84 K = 0% Params, 0 MACs = 0% MACs, 432.49 us = 0.19% latency, 0 FLOPS)
)
(self_attn): DiTSelfAttention(
44.24 M = 0.55% Params, 197.3 GMACs = 0.9% MACs, 2.96 ms = 1.29% latency, 133.4 TFLOPS
(q_norm): GemmaRMSNorm(120 = 0% Params, 0 MACs = 0% MACs, 442.98 us = 0.19% latency, 0 FLOPS)
(k_norm): GemmaRMSNorm(120 = 0% Params, 0 MACs = 0% MACs, 228.88 us = 0.1% latency, 0 FLOPS)
(q_proj): Linear(14.75 M = 0.18% Params, 60.4 GMACs = 0.28% MACs, 339.03 us = 0.15% latency, 356.3 TFLOPS, in_features=3840, out_features=3840, bias=False)
(k_proj): Linear(3.69 M = 0.05% Params, 15.1 GMACs = 0.07% MACs, 193.12 us = 0.08% latency, 156.37 TFLOPS, in_features=3840, out_features=960, bias=False)
(v_proj): Linear(3.69 M = 0.05% Params, 15.1 GMACs = 0.07% MACs, 189.3 us = 0.08% latency, 159.53 TFLOPS, in_features=3840, out_features=960, bias=False)
(text_k_proj): Linear(3.69 M = 0.05% Params, 15.1 GMACs = 0.07% MACs, 154.97 us = 0.07% latency, 194.87 TFLOPS, in_features=3840, out_features=960, bias=False)
(text_v_proj): Linear(3.69 M = 0.05% Params, 15.1 GMACs = 0.07% MACs, 158.55 us = 0.07% latency, 190.47 TFLOPS, in_features=3840, out_features=960, bias=False)
(o_proj): Linear(14.75 M = 0.18% Params, 60.4 GMACs = 0.28% MACs, 314.47 us = 0.14% latency, 384.12 TFLOPS, in_features=3840, out_features=3840, bias=False)
)
(post_attention_layernorm): GemmaRMSNorm(3.84 K = 0% Params, 0 MACs = 0% MACs, 428.44 us = 0.19% latency, 0 FLOPS)
(mlp): GemmaMLP(
117.96 M = 1.46% Params, 483.18 GMACs = 2.21% MACs, 2.02 ms = 0.88% latency, 479.47 TFLOPS
(gate_proj): Linear(39.32 M = 0.49% Params, 161.06 GMACs = 0.74% MACs, 587.7 us = 0.26% latency, 548.11 TFLOPS, in_features=3840, out_features=10240, bias=False)
(up_proj): Linear(39.32 M = 0.49% Params, 161.06 GMACs = 0.74% MACs, 585.79 us = 0.25% latency, 549.89 TFLOPS, in_features=3840, out_features=10240, bias=False)
(down_proj): Linear(39.32 M = 0.49% Params, 161.06 GMACs = 0.74% MACs, 545.5 us = 0.24% latency, 590.51 TFLOPS, in_features=10240, out_features=3840, bias=False)
(act_fn): PytorchGELUTanh(0 = 0% Params, 0 MACs = 0% MACs, 99.66 us = 0.04% latency, 420.87 GFLOPS)
)
)
(1): DiTLayer(
250.71 M = 3.1% Params, 681.9 GMACs = 3.12% MACs, 6.83 ms = 2.97% latency, 199.73 TFLOPS
(input_layernorm): AdaLayerNormZero(
88.5 M = 1.1% Params, 1.42 GMACs = 0.01% MACs, 891.69 us = 0.39% latency, 3.18 TFLOPS
(silu): SiLU(0 = 0% Params, 0 MACs = 0% MACs, 34.57 us = 0.02% latency, 1.78 GFLOPS)
(linear): Linear(88.5 M = 1.1% Params, 1.42 GMACs = 0.01% MACs, 224.11 us = 0.1% latency, 12.63 TFLOPS, in_features=3840, out_features=23040, bias=True)
(norm): GemmaRMSNorm(3.84 K = 0% Params, 0 MACs = 0% MACs, 432.25 us = 0.19% latency, 0 FLOPS)
)
(self_attn): DiTSelfAttention(
44.24 M = 0.55% Params, 197.3 GMACs = 0.9% MACs, 2.9 ms = 1.26% latency, 136.2 TFLOPS
(q_norm): GemmaRMSNorm(120 = 0% Params, 0 MACs = 0% MACs, 441.31 us = 0.19% latency, 0 FLOPS)
(k_norm): GemmaRMSNorm(120 = 0% Params, 0 MACs = 0% MACs, 226.97 us = 0.1% latency, 0 FLOPS)
(q_proj): Linear(14.75 M = 0.18% Params, 60.4 GMACs = 0.28% MACs, 319 us = 0.14% latency, 378.67 TFLOPS, in_features=3840, out_features=3840, bias=False)
(k_proj): Linear(3.69 M = 0.05% Params, 15.1 GMACs = 0.07% MACs, 192.17 us = 0.08% latency, 157.15 TFLOPS, in_features=3840, out_features=960, bias=False)
(v_proj): Linear(3.69 M = 0.05% Params, 15.1 GMACs = 0.07% MACs, 188.59 us = 0.08% latency, 160.13 TFLOPS, in_features=3840, out_features=960, bias=False)
(text_k_proj): Linear(3.69 M = 0.05% Params, 15.1 GMACs = 0.07% MACs, 152.83 us = 0.07% latency, 197.6 TFLOPS, in_features=3840, out_features=960, bias=False)
(text_v_proj): Linear(3.69 M = 0.05% Params, 15.1 GMACs = 0.07% MACs, 150.2 us = 0.07% latency, 201.05 TFLOPS, in_features=3840, out_features=960, bias=False)
(o_proj): Linear(14.75 M = 0.18% Params, 60.4 GMACs = 0.28% MACs, 305.65 us = 0.13% latency, 395.21 TFLOPS, in_features=3840, out_features=3840, bias=False)
)
(post_attention_layernorm): GemmaRMSNorm(3.84 K = 0% Params, 0 MACs = 0% MACs, 429.39 us = 0.19% latency, 0 FLOPS)
(mlp): GemmaMLP(
117.96 M = 1.46% Params, 483.18 GMACs = 2.21% MACs, 1.99 ms = 0.87% latency, 484.45 TFLOPS
(gate_proj): Linear(39.32 M = 0.49% Params, 161.06 GMACs = 0.74% MACs, 584.6 us = 0.25% latency, 551.01 TFLOPS, in_features=3840, out_features=10240, bias=False)
(up_proj): Linear(39.32 M = 0.49% Params, 161.06 GMACs = 0.74% MACs, 581.74 us = 0.25% latency, 553.72 TFLOPS, in_features=3840, out_features=10240, bias=False)
(down_proj): Linear(39.32 M = 0.49% Params, 161.06 GMACs = 0.74% MACs, 540.49 us = 0.24% latency, 595.98 TFLOPS, in_features=10240, out_features=3840, bias=False)
(act_fn): PytorchGELUTanh(0 = 0% Params, 0 MACs = 0% MACs, 95.84 us = 0.04% latency, 437.62 GFLOPS)
)
)
(2): DiTLayer(
250.71 M = 3.1% Params, 681.9 GMACs = 3.12% MACs, 6.84 ms = 2.98% latency, 199.4 TFLOPS
(input_layernorm): AdaLayerNormZero(
88.5 M = 1.1% Params, 1.42 GMACs = 0.01% MACs, 898.84 us = 0.39% latency, 3.15 TFLOPS
(silu): SiLU(0 = 0% Params, 0 MACs = 0% MACs, 35.52 us = 0.02% latency, 1.73 GFLOPS)
(linear): Linear(88.5 M = 1.1% Params, 1.42 GMACs = 0.01% MACs, 223.4 us = 0.1% latency, 12.67 TFLOPS, in_features=3840, out_features=23040, bias=True)
(norm): GemmaRMSNorm(3.84 K = 0% Params, 0 MACs = 0% MACs, 436.07 us = 0.19% latency, 0 FLOPS)
)
(self_attn): DiTSelfAttention(
44.24 M = 0.55% Params, 197.3 GMACs = 0.9% MACs, 2.9 ms = 1.26% latency, 136.22 TFLOPS
(q_norm): GemmaRMSNorm(120 = 0% Params, 0 MACs = 0% MACs, 441.31 us = 0.19% latency, 0 FLOPS)
(k_norm): GemmaRMSNorm(120 = 0% Params, 0 MACs = 0% MACs, 227.93 us = 0.1% latency, 0 FLOPS)
(q_proj): Linear(14.75 M = 0.18% Params, 60.4 GMACs = 0.28% MACs, 319.48 us = 0.14% latency, 378.1 TFLOPS, in_features=3840, out_features=3840, bias=False)
(k_proj): Linear(3.69 M = 0.05% Params, 15.1 GMACs = 0.07% MACs, 191.45 us = 0.08% latency, 157.74 TFLOPS, in_features=3840, out_features=960, bias=False)
(v_proj): Linear(3.69 M = 0.05% Params, 15.1 GMACs = 0.07% MACs, 188.35 us = 0.08% latency, 160.33 TFLOPS, in_features=3840, out_features=960, bias=False)
(text_k_proj): Linear(3.69 M = 0.05% Params, 15.1 GMACs = 0.07% MACs, 152.59 us = 0.07% latency, 197.91 TFLOPS, in_features=3840, out_features=960, bias=False)
(text_v_proj): Linear(3.69 M = 0.05% Params, 15.1 GMACs = 0.07% MACs, 151.63 us = 0.07% latency, 199.16 TFLOPS, in_features=3840, out_features=960, bias=False)
(o_proj): Linear(14.75 M = 0.18% Params, 60.4 GMACs = 0.28% MACs, 303.98 us = 0.13% latency, 397.38 TFLOPS, in_features=3840, out_features=3840, bias=False)
)
(post_attention_layernorm): GemmaRMSNorm(3.84 K = 0% Params, 0 MACs = 0% MACs, 430.11 us = 0.19% latency, 0 FLOPS)
(mlp): GemmaMLP(
117.96 M = 1.46% Params, 483.18 GMACs = 2.21% MACs, 2 ms = 0.87% latency, 483.3 TFLOPS
(gate_proj): Linear(39.32 M = 0.49% Params, 161.06 GMACs = 0.74% MACs, 586.03 us = 0.25% latency, 549.67 TFLOPS, in_features=3840, out_features=10240, bias=False)
(up_proj): Linear(39.32 M = 0.49% Params, 161.06 GMACs = 0.74% MACs, 584.13 us = 0.25% latency, 551.46 TFLOPS, in_features=3840, out_features=10240, bias=False)
(down_proj): Linear(39.32 M = 0.49% Params, 161.06 GMACs = 0.74% MACs, 542.16 us = 0.24% latency, 594.14 TFLOPS, in_features=10240, out_features=3840, bias=False)
(act_fn): PytorchGELUTanh(0 = 0% Params, 0 MACs = 0% MACs, 95.37 us = 0.04% latency, 439.8 GFLOPS)
)
)
(3): DiTLayer(
250.71 M = 3.1% Params, 681.9 GMACs = 3.12% MACs, 6.84 ms = 2.97% latency, 199.51 TFLOPS
(input_layernorm): AdaLayerNormZero(
88.5 M = 1.1% Params, 1.42 GMACs = 0.01% MACs, 902.18 us = 0.39% latency, 3.14 TFLOPS
(silu): SiLU(0 = 0% Params, 0 MACs = 0% MACs, 37.19 us = 0.02% latency, 1.65 GFLOPS)
(linear): Linear(88.5 M = 1.1% Params, 1.42 GMACs = 0.01% MACs, 230.55 us = 0.1% latency, 12.28 TFLOPS, in_features=3840, out_features=23040, bias=True)
(norm): GemmaRMSNorm(3.84 K = 0% Params, 0 MACs = 0% MACs, 434.4 us = 0.19% latency, 0 FLOPS)
)
(self_attn): DiTSelfAttention(
44.24 M = 0.55% Params, 197.3 GMACs = 0.9% MACs, 2.89 ms = 1.26% latency, 136.51 TFLOPS
(q_norm): GemmaRMSNorm(120 = 0% Params, 0 MACs = 0% MACs, 441.55 us = 0.19% latency, 0 FLOPS)
(k_norm): GemmaRMSNorm(120 = 0% Params, 0 MACs = 0% MACs, 227.69 us = 0.1% latency, 0 FLOPS)
(q_proj): Linear(14.75 M = 0.18% Params, 60.4 GMACs = 0.28% MACs, 318.53 us = 0.14% latency, 379.23 TFLOPS, in_features=3840, out_features=3840, bias=False)
(k_proj): Linear(3.69 M = 0.05% Params, 15.1 GMACs = 0.07% MACs, 192.4 us = 0.08% latency, 156.96 TFLOPS, in_features=3840, out_features=960, bias=False)
(v_proj): Linear(3.69 M = 0.05% Params, 15.1 GMACs = 0.07% MACs, 188.83 us = 0.08% latency, 159.93 TFLOPS, in_features=3840, out_features=960, bias=False)
(text_k_proj): Linear(3.69 M = 0.05% Params, 15.1 GMACs = 0.07% MACs, 152.83 us = 0.07% latency, 197.6 TFLOPS, in_features=3840, out_features=960, bias=False)
(text_v_proj): Linear(3.69 M = 0.05% Params, 15.1 GMACs = 0.07% MACs, 149.97 us = 0.07% latency, 201.37 TFLOPS, in_features=3840, out_features=960, bias=False)
(o_proj): Linear(14.75 M = 0.18% Params, 60.4 GMACs = 0.28% MACs, 301.6 us = 0.13% latency, 400.52 TFLOPS, in_features=3840, out_features=3840, bias=False)
)
(post_attention_layernorm): GemmaRMSNorm(3.84 K = 0% Params, 0 MACs = 0% MACs, 429.63 us = 0.19% latency, 0 FLOPS)
(mlp): GemmaMLP(
117.96 M = 1.46% Params, 483.18 GMACs = 2.21% MACs, 2 ms = 0.87% latency, 484.05 TFLOPS
(gate_proj): Linear(39.32 M = 0.49% Params, 161.06 GMACs = 0.74% MACs, 582.93 us = 0.25% latency, 552.59 TFLOPS, in_features=3840, out_features=10240, bias=False)
(up_proj): Linear(39.32 M = 0.49% Params, 161.06 GMACs = 0.74% MACs, 580.79 us = 0.25% latency, 554.63 TFLOPS, in_features=3840, out_features=10240, bias=False)
(down_proj): Linear(39.32 M = 0.49% Params, 161.06 GMACs = 0.74% MACs, 538.59 us = 0.23% latency, 598.09 TFLOPS, in_features=10240, out_features=3840, bias=False)
(act_fn): PytorchGELUTanh(0 = 0% Params, 0 MACs = 0% MACs, 102.28 us = 0.04% latency, 410.07 GFLOPS)
)
)
(4): DiTLayer(
250.71 M = 3.1% Params, 681.9 GMACs = 3.12% MACs, 6.85 ms = 2.98% latency, 199.09 TFLOPS
(input_layernorm): AdaLayerNormZero(
88.5 M = 1.1% Params, 1.42 GMACs = 0.01% MACs, 887.63 us = 0.39% latency, 3.19 TFLOPS
(silu): SiLU(0 = 0% Params, 0 MACs = 0% MACs, 36.24 us = 0.02% latency, 1.7 GFLOPS)
(linear): Linear(88.5 M = 1.1% Params, 1.42 GMACs = 0.01% MACs, 219.35 us = 0.1% latency, 12.91 TFLOPS, in_features=3840, out_features=23040, bias=True)
(norm): GemmaRMSNorm(3.84 K = 0% Params, 0 MACs = 0% MACs, 431.78 us = 0.19% latency, 0 FLOPS)
)
(self_attn): DiTSelfAttention(
44.24 M = 0.55% Params, 197.3 GMACs = 0.9% MACs, 2.91 ms = 1.26% latency, 135.74 TFLOPS
(q_norm): GemmaRMSNorm(120 = 0% Params, 0 MACs = 0% MACs, 443.94 us = 0.19% latency, 0 FLOPS)
(k_norm): GemmaRMSNorm(120 = 0% Params, 0 MACs = 0% MACs, 227.69 us = 0.1% latency, 0 FLOPS)
(q_proj): Linear(14.75 M = 0.18% Params, 60.4 GMACs = 0.28% MACs, 319.48 us = 0.14% latency, 378.1 TFLOPS, in_features=3840, out_features=3840, bias=False)
(k_proj): Linear(3.69 M = 0.05% Params, 15.1 GMACs = 0.07% MACs, 191.21 us = 0.08% latency, 157.93 TFLOPS, in_features=3840, out_features=960, bias=False)
(v_proj): Linear(3.69 M = 0.05% Params, 15.1 GMACs = 0.07% MACs, 189.07 us = 0.08% latency, 159.73 TFLOPS, in_features=3840, out_features=960, bias=False)
(text_k_proj): Linear(3.69 M = 0.05% Params, 15.1 GMACs = 0.07% MACs, 161.89 us = 0.07% latency, 186.54 TFLOPS, in_features=3840, out_features=960, bias=False)
(text_v_proj): Linear(3.69 M = 0.05% Params, 15.1 GMACs = 0.07% MACs, 151.4 us = 0.07% latency, 199.47 TFLOPS, in_features=3840, out_features=960, bias=False)
(o_proj): Linear(14.75 M = 0.18% Params, 60.4 GMACs = 0.28% MACs, 303.03 us = 0.13% latency, 398.63 TFLOPS, in_features=3840, out_features=3840, bias=False)
)
(post_attention_layernorm): GemmaRMSNorm(3.84 K = 0% Params, 0 MACs = 0% MACs, 430.35 us = 0.19% latency, 0 FLOPS)
(mlp): GemmaMLP(
117.96 M = 1.46% Params, 483.18 GMACs = 2.21% MACs, 2.01 ms = 0.87% latency, 480.6 TFLOPS
(gate_proj): Linear(39.32 M = 0.49% Params, 161.06 GMACs = 0.74% MACs, 589.13 us = 0.26% latency, 546.77 TFLOPS, in_features=3840, out_features=10240, bias=False)
(up_proj): Linear(39.32 M = 0.49% Params, 161.06 GMACs = 0.74% MACs, 592.23 us = 0.26% latency, 543.91 TFLOPS, in_features=3840, out_features=10240, bias=False)
(down_proj): Linear(39.32 M = 0.49% Params, 161.06 GMACs = 0.74% MACs, 541.69 us = 0.24% latency, 594.67 TFLOPS, in_features=10240, out_features=3840, bias=False)
(act_fn): PytorchGELUTanh(0 = 0% Params, 0 MACs = 0% MACs, 95.61 us = 0.04% latency, 438.71 GFLOPS)
)
)
(5): DiTLayer(
250.71 M = 3.1% Params, 681.9 GMACs = 3.12% MACs, 6.84 ms = 2.97% latency, 199.5 TFLOPS
(input_layernorm): AdaLayerNormZero(
88.5 M = 1.1% Params, 1.42 GMACs = 0.01% MACs, 890.73 us = 0.39% latency, 3.18 TFLOPS
(silu): SiLU(0 = 0% Params, 0 MACs = 0% MACs, 34.81 us = 0.02% latency, 1.77 GFLOPS)
(linear): Linear(88.5 M = 1.1% Params, 1.42 GMACs = 0.01% MACs, 222.21 us = 0.1% latency, 12.74 TFLOPS, in_features=3840, out_features=23040, bias=True)
(norm): GemmaRMSNorm(3.84 K = 0% Params, 0 MACs = 0% MACs, 434.64 us = 0.19% latency, 0 FLOPS)
)
(self_attn): DiTSelfAttention(
44.24 M = 0.55% Params, 197.3 GMACs = 0.9% MACs, 2.9 ms = 1.26% latency, 135.98 TFLOPS
(q_norm): GemmaRMSNorm(120 = 0% Params, 0 MACs = 0% MACs, 442.03 us = 0.19% latency, 0 FLOPS)
(k_norm): GemmaRMSNorm(120 = 0% Params, 0 MACs = 0% MACs, 235.56 us = 0.1% latency, 0 FLOPS)
(q_proj): Linear(14.75 M = 0.18% Params, 60.4 GMACs = 0.28% MACs, 319 us = 0.14% latency, 378.67 TFLOPS, in_features=3840, out_features=3840, bias=False)
(k_proj): Linear(3.69 M = 0.05% Params, 15.1 GMACs = 0.07% MACs, 190.5 us = 0.08% latency, 158.53 TFLOPS, in_features=3840, out_features=960, bias=False)
(v_proj): Linear(3.69 M = 0.05% Params, 15.1 GMACs = 0.07% MACs, 188.59 us = 0.08% latency, 160.13 TFLOPS, in_features=3840, out_features=960, bias=False)
(text_k_proj): Linear(3.69 M = 0.05% Params, 15.1 GMACs = 0.07% MACs, 152.11 us = 0.07% latency, 198.53 TFLOPS, in_features=3840, out_features=960, bias=False)
(text_v_proj): Linear(3.69 M = 0.05% Params, 15.1 GMACs = 0.07% MACs, 149.73 us = 0.07% latency, 201.69 TFLOPS, in_features=3840, out_features=960, bias=False)
(o_proj): Linear(14.75 M = 0.18% Params, 60.4 GMACs = 0.28% MACs, 305.89 us = 0.13% latency, 394.9 TFLOPS, in_features=3840, out_features=3840, bias=False)
)
(post_attention_layernorm): GemmaRMSNorm(3.84 K = 0% Params, 0 MACs = 0% MACs, 429.15 us = 0.19% latency, 0 FLOPS)
(mlp): GemmaMLP(
117.96 M = 1.46% Params, 483.18 GMACs = 2.21% MACs, 2 ms = 0.87% latency, 483.47 TFLOPS
(gate_proj): Linear(39.32 M = 0.49% Params, 161.06 GMACs = 0.74% MACs, 586.27 us = 0.26% latency, 549.44 TFLOPS, in_features=3840, out_features=10240, bias=False)
(up_proj): Linear(39.32 M = 0.49% Params, 161.06 GMACs = 0.74% MACs, 582.7 us = 0.25% latency, 552.82 TFLOPS, in_features=3840, out_features=10240, bias=False)
(down_proj): Linear(39.32 M = 0.49% Params, 161.06 GMACs = 0.74% MACs, 537.63 us = 0.23% latency, 599.15 TFLOPS, in_features=10240, out_features=3840, bias=False)
(act_fn): PytorchGELUTanh(0 = 0% Params, 0 MACs = 0% MACs, 100.85 us = 0.04% latency, 415.89 GFLOPS)
)
)
(6): DiTLayer(
250.71 M = 3.1% Params, 681.9 GMACs = 3.12% MACs, 6.87 ms = 2.99% latency, 198.43 TFLOPS
(input_layernorm): AdaLayerNormZero(
88.5 M = 1.1% Params, 1.42 GMACs = 0.01% MACs, 888.11 us = 0.39% latency, 3.19 TFLOPS
(silu): SiLU(0 = 0% Params, 0 MACs = 0% MACs, 34.81 us = 0.02% latency, 1.77 GFLOPS)
(linear): Linear(88.5 M = 1.1% Params, 1.42 GMACs = 0.01% MACs, 222.21 us = 0.1% latency, 12.74 TFLOPS, in_features=3840, out_features=23040, bias=True)
(norm): GemmaRMSNorm(3.84 K = 0% Params, 0 MACs = 0% MACs, 433.68 us = 0.19% latency, 0 FLOPS)
)
(self_attn): DiTSelfAttention(
44.24 M = 0.55% Params, 197.3 GMACs = 0.9% MACs, 2.91 ms = 1.27% latency, 135.51 TFLOPS
(q_norm): GemmaRMSNorm(120 = 0% Params, 0 MACs = 0% MACs, 441.31 us = 0.19% latency, 0 FLOPS)
(k_norm): GemmaRMSNorm(120 = 0% Params, 0 MACs = 0% MACs, 228.4 us = 0.1% latency, 0 FLOPS)
(q_proj): Linear(14.75 M = 0.18% Params, 60.4 GMACs = 0.28% MACs, 319.48 us = 0.14% latency, 378.1 TFLOPS, in_features=3840, out_features=3840, bias=False)
(k_proj): Linear(3.69 M = 0.05% Params, 15.1 GMACs = 0.07% MACs, 192.64 us = 0.08% latency, 156.76 TFLOPS, in_features=3840, out_features=960, bias=False)
(v_proj): Linear(3.69 M = 0.05% Params, 15.1 GMACs = 0.07% MACs, 190.5 us = 0.08% latency, 158.53 TFLOPS, in_features=3840, out_features=960, bias=False)
(text_k_proj): Linear(3.69 M = 0.05% Params, 15.1 GMACs = 0.07% MACs, 155.21 us = 0.07% latency, 194.57 TFLOPS, in_features=3840, out_features=960, bias=False)
(text_v_proj): Linear(3.69 M = 0.05% Params, 15.1 GMACs = 0.07% MACs, 153.3 us = 0.07% latency, 196.99 TFLOPS, in_features=3840, out_features=960, bias=False)
(o_proj): Linear(14.75 M = 0.18% Params, 60.4 GMACs = 0.28% MACs, 310.66 us = 0.14% latency, 388.84 TFLOPS, in_features=3840, out_features=3840, bias=False)
)
(post_attention_layernorm): GemmaRMSNorm(3.84 K = 0% Params, 0 MACs = 0% MACs, 430.58 us = 0.19% latency, 0 FLOPS)
(mlp): GemmaMLP(
117.96 M = 1.46% Params, 483.18 GMACs = 2.21% MACs, 2.02 ms = 0.88% latency, 477.55 TFLOPS
(gate_proj): Linear(39.32 M = 0.49% Params, 161.06 GMACs = 0.74% MACs, 587.94 us = 0.26% latency, 547.88 TFLOPS, in_features=3840, out_features=10240, bias=False)
(up_proj): Linear(39.32 M = 0.49% Params, 161.06 GMACs = 0.74% MACs, 589.37 us = 0.26% latency, 546.55 TFLOPS, in_features=3840, out_features=10240, bias=False)
(down_proj): Linear(39.32 M = 0.49% Params, 161.06 GMACs = 0.74% MACs, 551.46 us = 0.24% latency, 584.12 TFLOPS, in_features=10240, out_features=3840, bias=False)
(act_fn): PytorchGELUTanh(0 = 0% Params, 0 MACs = 0% MACs, 96.8 us = 0.04% latency, 433.31 GFLOPS)
)
)
(7): DiTLayer(
250.71 M = 3.1% Params, 681.9 GMACs = 3.12% MACs, 6.89 ms = 3% latency, 198.02 TFLOPS
(input_layernorm): AdaLayerNormZero(
88.5 M = 1.1% Params, 1.42 GMACs = 0.01% MACs, 907.66 us = 0.39% latency, 3.12 TFLOPS
(silu): SiLU(0 = 0% Params, 0 MACs = 0% MACs, 38.15 us = 0.02% latency, 1.61 GFLOPS)
(linear): Linear(88.5 M = 1.1% Params, 1.42 GMACs = 0.01% MACs, 232.46 us = 0.1% latency, 12.18 TFLOPS, in_features=3840, out_features=23040, bias=True)
(norm): GemmaRMSNorm(3.84 K = 0% Params, 0 MACs = 0% MACs, 433.68 us = 0.19% latency, 0 FLOPS)
)
(self_attn): DiTSelfAttention(
44.24 M = 0.55% Params, 197.3 GMACs = 0.9% MACs, 2.93 ms = 1.27% latency, 134.85 TFLOPS
(q_norm): GemmaRMSNorm(120 = 0% Params, 0 MACs = 0% MACs, 442.03 us = 0.19% latency, 0 FLOPS)
(k_norm): GemmaRMSNorm(120 = 0% Params, 0 MACs = 0% MACs, 227.21 us = 0.1% latency, 0 FLOPS)
(q_proj): Linear(14.75 M = 0.18% Params, 60.4 GMACs = 0.28% MACs, 320.2 us = 0.14% latency, 377.26 TFLOPS, in_features=3840, out_features=3840, bias=False)
(k_proj): Linear(3.69 M = 0.05% Params, 15.1 GMACs = 0.07% MACs, 190.97 us = 0.08% latency, 158.13 TFLOPS, in_features=3840, out_features=960, bias=False)
(v_proj): Linear(3.69 M = 0.05% Params, 15.1 GMACs = 0.07% MACs, 188.83 us = 0.08% latency, 159.93 TFLOPS, in_features=3840, out_features=960, bias=False)
(text_k_proj): Linear(3.69 M = 0.05% Params, 15.1 GMACs = 0.07% MACs, 153.06 us = 0.07% latency, 197.3 TFLOPS, in_features=3840, out_features=960, bias=False)
(text_v_proj): Linear(3.69 M = 0.05% Params, 15.1 GMACs = 0.07% MACs, 149.49 us = 0.07% latency, 202.02 TFLOPS, in_features=3840, out_features=960, bias=False)
(o_proj): Linear(14.75 M = 0.18% Params, 60.4 GMACs = 0.28% MACs, 319 us = 0.14% latency, 378.67 TFLOPS, in_features=3840, out_features=3840, bias=False)
)
(post_attention_layernorm): GemmaRMSNorm(3.84 K = 0% Params, 0 MACs = 0% MACs, 429.39 us = 0.19% latency, 0 FLOPS)
(mlp): GemmaMLP(
117.96 M = 1.46% Params, 483.18 GMACs = 2.21% MACs, 2 ms = 0.87% latency, 483.99 TFLOPS
(gate_proj): Linear(39.32 M = 0.49% Params, 161.06 GMACs = 0.74% MACs, 585.79 us = 0.25% latency, 549.89 TFLOPS, in_features=3840, out_features=10240, bias=False)
(up_proj): Linear(39.32 M = 0.49% Params, 161.06 GMACs = 0.74% MACs, 581.03 us = 0.25% latency, 554.4 TFLOPS, in_features=3840, out_features=10240, bias=False)
(down_proj): Linear(39.32 M = 0.49% Params, 161.06 GMACs = 0.74% MACs, 537.4 us = 0.23% latency, 599.41 TFLOPS, in_features=10240, out_features=3840, bias=False)
(act_fn): PytorchGELUTanh(0 = 0% Params, 0 MACs = 0% MACs, 95.61 us = 0.04% latency, 438.71 GFLOPS)
)
)
(8): DiTLayer(
250.71 M = 3.1% Params, 681.9 GMACs = 3.12% MACs, 6.92 ms = 3.01% latency, 197.04 TFLOPS
(input_layernorm): AdaLayerNormZero(
88.5 M = 1.1% Params, 1.42 GMACs = 0.01% MACs, 927.93 us = 0.4% latency, 3.05 TFLOPS
(silu): SiLU(0 = 0% Params, 0 MACs = 0% MACs, 41.48 us = 0.02% latency, 1.48 GFLOPS)
(linear): Linear(88.5 M = 1.1% Params, 1.42 GMACs = 0.01% MACs, 241.28 us = 0.1% latency, 11.73 TFLOPS, in_features=3840, out_features=23040, bias=True)
(norm): GemmaRMSNorm(3.84 K = 0% Params, 0 MACs = 0% MACs, 434.16 us = 0.19% latency, 0 FLOPS)
)
(self_attn): DiTSelfAttention(
44.24 M = 0.55% Params, 197.3 GMACs = 0.9% MACs, 2.93 ms = 1.27% latency, 134.71 TFLOPS
(q_norm): GemmaRMSNorm(120 = 0% Params, 0 MACs = 0% MACs, 443.7 us = 0.19% latency, 0 FLOPS)
(k_norm): GemmaRMSNorm(120 = 0% Params, 0 MACs = 0% MACs, 227.93 us = 0.1% latency, 0 FLOPS)
(q_proj): Linear(14.75 M = 0.18% Params, 60.4 GMACs = 0.28% MACs, 323.53 us = 0.14% latency, 373.36 TFLOPS, in_features=3840, out_features=3840, bias=False)
(k_proj): Linear(3.69 M = 0.05% Params, 15.1 GMACs = 0.07% MACs, 192.64 us = 0.08% latency, 156.76 TFLOPS, in_features=3840, out_features=960, bias=False)
(v_proj): Linear(3.69 M = 0.05% Params, 15.1 GMACs = 0.07% MACs, 190.02 us = 0.08% latency, 158.93 TFLOPS, in_features=3840, out_features=960, bias=False)
(text_k_proj): Linear(3.69 M = 0.05% Params, 15.1 GMACs = 0.07% MACs, 158.31 us = 0.07% latency, 190.76 TFLOPS, in_features=3840, out_features=960, bias=False)
(text_v_proj): Linear(3.69 M = 0.05% Params, 15.1 GMACs = 0.07% MACs, 155.45 us = 0.07% latency, 194.27 TFLOPS, in_features=3840, out_features=960, bias=False)
(o_proj): Linear(14.75 M = 0.18% Params, 60.4 GMACs = 0.28% MACs, 309.71 us = 0.13% latency, 390.03 TFLOPS, in_features=3840, out_features=3840, bias=False)
)
(post_attention_layernorm): GemmaRMSNorm(3.84 K = 0% Params, 0 MACs = 0% MACs, 430.11 us = 0.19% latency, 0 FLOPS)
(mlp): GemmaMLP(
117.96 M = 1.46% Params, 483.18 GMACs = 2.21% MACs, 2.01 ms = 0.87% latency, 481.52 TFLOPS
(gate_proj): Linear(39.32 M = 0.49% Params, 161.06 GMACs = 0.74% MACs, 588.42 us = 0.26% latency, 547.44 TFLOPS, in_features=3840, out_features=10240, bias=False)
(up_proj): Linear(39.32 M = 0.49% Params, 161.06 GMACs = 0.74% MACs, 586.99 us = 0.26% latency, 548.77 TFLOPS, in_features=3840, out_features=10240, bias=False)
(down_proj): Linear(39.32 M = 0.49% Params, 161.06 GMACs = 0.74% MACs, 542.16 us = 0.24% latency, 594.14 TFLOPS, in_features=10240, out_features=3840, bias=False)
(act_fn): PytorchGELUTanh(0 = 0% Params, 0 MACs = 0% MACs, 94.89 us = 0.04% latency, 442.01 GFLOPS)
)
)
(9): DiTLayer(
250.71 M = 3.1% Params, 681.9 GMACs = 3.12% MACs, 6.86 ms = 2.98% latency, 198.91 TFLOPS
(input_layernorm): AdaLayerNormZero(
88.5 M = 1.1% Params, 1.42 GMACs = 0.01% MACs, 909.33 us = 0.4% latency, 3.11 TFLOPS
(silu): SiLU(0 = 0% Params, 0 MACs = 0% MACs, 42.44 us = 0.02% latency, 1.45 GFLOPS)
(linear): Linear(88.5 M = 1.1% Params, 1.42 GMACs = 0.01% MACs, 233.17 us = 0.1% latency, 12.14 TFLOPS, in_features=3840, out_features=23040, bias=True)
(norm): GemmaRMSNorm(3.84 K = 0% Params, 0 MACs = 0% MACs, 433.68 us = 0.19% latency, 0 FLOPS)
)
(self_attn): DiTSelfAttention(
44.24 M = 0.55% Params, 197.3 GMACs = 0.9% MACs, 2.91 ms = 1.27% latency, 135.66 TFLOPS
(q_norm): GemmaRMSNorm(120 = 0% Params, 0 MACs = 0% MACs, 442.27 us = 0.19% latency, 0 FLOPS)
(k_norm): GemmaRMSNorm(120 = 0% Params, 0 MACs = 0% MACs, 227.93 us = 0.1% latency, 0 FLOPS)
(q_proj): Linear(14.75 M = 0.18% Params, 60.4 GMACs = 0.28% MACs, 318.77 us = 0.14% latency, 378.95 TFLOPS, in_features=3840, out_features=3840, bias=False)
(k_proj): Linear(3.69 M = 0.05% Params, 15.1 GMACs = 0.07% MACs, 191.21 us = 0.08% latency, 157.93 TFLOPS, in_features=3840, out_features=960, bias=False)
(v_proj): Linear(3.69 M = 0.05% Params, 15.1 GMACs = 0.07% MACs, 188.35 us = 0.08% latency, 160.33 TFLOPS, in_features=3840, out_features=960, bias=False)
(text_k_proj): Linear(3.69 M = 0.05% Params, 15.1 GMACs = 0.07% MACs, 152.35 us = 0.07% latency, 198.22 TFLOPS, in_features=3840, out_features=960, bias=False)
(text_v_proj): Linear(3.69 M = 0.05% Params, 15.1 GMACs = 0.07% MACs, 150.44 us = 0.07% latency, 200.73 TFLOPS, in_features=3840, out_features=960, bias=False)
(o_proj): Linear(14.75 M = 0.18% Params, 60.4 GMACs = 0.28% MACs, 312.57 us = 0.14% latency, 386.46 TFLOPS, in_features=3840, out_features=3840, bias=False)
)
(post_attention_layernorm): GemmaRMSNorm(3.84 K = 0% Params, 0 MACs = 0% MACs, 429.87 us = 0.19% latency, 0 FLOPS)
(mlp): GemmaMLP(
117.96 M = 1.46% Params, 483.18 GMACs = 2.21% MACs, 2 ms = 0.87% latency, 484.34 TFLOPS
(gate_proj): Linear(39.32 M = 0.49% Params, 161.06 GMACs = 0.74% MACs, 586.51 us = 0.26% latency, 549.22 TFLOPS, in_features=3840, out_features=10240, bias=False)
(up_proj): Linear(39.32 M = 0.49% Params, 161.06 GMACs = 0.74% MACs, 583.41 us = 0.25% latency, 552.14 TFLOPS, in_features=3840, out_features=10240, bias=False)
(down_proj): Linear(39.32 M = 0.49% Params, 161.06 GMACs = 0.74% MACs, 539.3 us = 0.23% latency, 597.29 TFLOPS, in_features=10240, out_features=3840, bias=False)
(act_fn): PytorchGELUTanh(0 = 0% Params, 0 MACs = 0% MACs, 94.18 us = 0.04% latency, 445.37 GFLOPS)
)
)
(10): DiTLayer(
250.71 M = 3.1% Params, 681.9 GMACs = 3.12% MACs, 6.84 ms = 2.97% latency, 199.44 TFLOPS
(input_layernorm): AdaLayerNormZero(
88.5 M = 1.1% Params, 1.42 GMACs = 0.01% MACs, 886.2 us = 0.39% latency, 3.19 TFLOPS
(silu): SiLU(0 = 0% Params, 0 MACs = 0% MACs, 35.29 us = 0.02% latency, 1.74 GFLOPS)
(linear): Linear(88.5 M = 1.1% Params, 1.42 GMACs = 0.01% MACs, 221.97 us = 0.1% latency, 12.75 TFLOPS, in_features=3840, out_features=23040, bias=True)
(norm): GemmaRMSNorm(3.84 K = 0% Params, 0 MACs = 0% MACs, 433.21 us = 0.19% latency, 0 FLOPS)
)
(self_attn): DiTSelfAttention(
44.24 M = 0.55% Params, 197.3 GMACs = 0.9% MACs, 2.9 ms = 1.26% latency, 136.06 TFLOPS
(q_norm): GemmaRMSNorm(120 = 0% Params, 0 MACs = 0% MACs, 442.5 us = 0.19% latency, 0 FLOPS)
(k_norm): GemmaRMSNorm(120 = 0% Params, 0 MACs = 0% MACs, 226.97 us = 0.1% latency, 0 FLOPS)
(q_proj): Linear(14.75 M = 0.18% Params, 60.4 GMACs = 0.28% MACs, 318.77 us = 0.14% latency, 378.95 TFLOPS, in_features=3840, out_features=3840, bias=False)
(k_proj): Linear(3.69 M = 0.05% Params, 15.1 GMACs = 0.07% MACs, 190.73 us = 0.08% latency, 158.33 TFLOPS, in_features=3840, out_features=960, bias=False)
(v_proj): Linear(3.69 M = 0.05% Params, 15.1 GMACs = 0.07% MACs, 188.59 us = 0.08% latency, 160.13 TFLOPS, in_features=3840, out_features=960, bias=False)
(text_k_proj): Linear(3.69 M = 0.05% Params, 15.1 GMACs = 0.07% MACs, 151.63 us = 0.07% latency, 199.16 TFLOPS, in_features=3840, out_features=960, bias=False)
(text_v_proj): Linear(3.69 M = 0.05% Params, 15.1 GMACs = 0.07% MACs, 149.73 us = 0.07% latency, 201.69 TFLOPS, in_features=3840, out_features=960, bias=False)
(o_proj): Linear(14.75 M = 0.18% Params, 60.4 GMACs = 0.28% MACs, 306.61 us = 0.13% latency, 393.98 TFLOPS, in_features=3840, out_features=3840, bias=False)
)
(post_attention_layernorm): GemmaRMSNorm(3.84 K = 0% Params, 0 MACs = 0% MACs, 430.35 us = 0.19% latency, 0 FLOPS)
(mlp): GemmaMLP(
117.96 M = 1.46% Params, 483.18 GMACs = 2.21% MACs, 2.01 ms = 0.87% latency, 481.46 TFLOPS
(gate_proj): Linear(39.32 M = 0.49% Params, 161.06 GMACs = 0.74% MACs, 588.18 us = 0.26% latency, 547.66 TFLOPS, in_features=3840, out_features=10240, bias=False)
(up_proj): Linear(39.32 M = 0.49% Params, 161.06 GMACs = 0.74% MACs, 586.51 us = 0.26% latency, 549.22 TFLOPS, in_features=3840, out_features=10240, bias=False)
(down_proj): Linear(39.32 M = 0.49% Params, 161.06 GMACs = 0.74% MACs, 542.16 us = 0.24% latency, 594.14 TFLOPS, in_features=10240, out_features=3840, bias=False)
(act_fn): PytorchGELUTanh(0 = 0% Params, 0 MACs = 0% MACs, 95.13 us = 0.04% latency, 440.91 GFLOPS)
)
)
(11): DiTLayer(
250.71 M = 3.1% Params, 681.9 GMACs = 3.12% MACs, 6.83 ms = 2.97% latency, 199.64 TFLOPS
(input_layernorm): AdaLayerNormZero(
88.5 M = 1.1% Params, 1.42 GMACs = 0.01% MACs, 894.07 us = 0.39% latency, 3.17 TFLOPS
(silu): SiLU(0 = 0% Params, 0 MACs = 0% MACs, 34.57 us = 0.02% latency, 1.78 GFLOPS)
(linear): Linear(88.5 M = 1.1% Params, 1.42 GMACs = 0.01% MACs, 220.54 us = 0.1% latency, 12.84 TFLOPS, in_features=3840, out_features=23040, bias=True)
(norm): GemmaRMSNorm(3.84 K = 0% Params, 0 MACs = 0% MACs, 442.03 us = 0.19% latency, 0 FLOPS)
)
(self_attn): DiTSelfAttention(
44.24 M = 0.55% Params, 197.3 GMACs = 0.9% MACs, 2.9 ms = 1.26% latency, 135.9 TFLOPS
(q_norm): GemmaRMSNorm(120 = 0% Params, 0 MACs = 0% MACs, 441.31 us = 0.19% latency, 0 FLOPS)
(k_norm): GemmaRMSNorm(120 = 0% Params, 0 MACs = 0% MACs, 227.21 us = 0.1% latency, 0 FLOPS)
(q_proj): Linear(14.75 M = 0.18% Params, 60.4 GMACs = 0.28% MACs, 319.48 us = 0.14% latency, 378.1 TFLOPS, in_features=3840, out_features=3840, bias=False)
(k_proj): Linear(3.69 M = 0.05% Params, 15.1 GMACs = 0.07% MACs, 191.69 us = 0.08% latency, 157.54 TFLOPS, in_features=3840, out_features=960, bias=False)
(v_proj): Linear(3.69 M = 0.05% Params, 15.1 GMACs = 0.07% MACs, 189.3 us = 0.08% latency, 159.53 TFLOPS, in_features=3840, out_features=960, bias=False)
(text_k_proj): Linear(3.69 M = 0.05% Params, 15.1 GMACs = 0.07% MACs, 153.78 us = 0.07% latency, 196.38 TFLOPS, in_features=3840, out_features=960, bias=False)
(text_v_proj): Linear(3.69 M = 0.05% Params, 15.1 GMACs = 0.07% MACs, 151.16 us = 0.07% latency, 199.79 TFLOPS, in_features=3840, out_features=960, bias=False)
(o_proj): Linear(14.75 M = 0.18% Params, 60.4 GMACs = 0.28% MACs, 305.41 us = 0.13% latency, 395.52 TFLOPS, in_features=3840, out_features=3840, bias=False)
)
(post_attention_layernorm): GemmaRMSNorm(3.84 K = 0% Params, 0 MACs = 0% MACs, 430.58 us = 0.19% latency, 0 FLOPS)
(mlp): GemmaMLP(
117.96 M = 1.46% Params, 483.18 GMACs = 2.21% MACs, 1.99 ms = 0.87% latency, 485.9 TFLOPS
(gate_proj): Linear(39.32 M = 0.49% Params, 161.06 GMACs = 0.74% MACs, 583.65 us = 0.25% latency, 551.91 TFLOPS, in_features=3840, out_features=10240, bias=False)
(up_proj): Linear(39.32 M = 0.49% Params, 161.06 GMACs = 0.74% MACs, 580.31 us = 0.25% latency, 555.09 TFLOPS, in_features=3840, out_features=10240, bias=False)
(down_proj): Linear(39.32 M = 0.49% Params, 161.06 GMACs = 0.74% MACs, 537.63 us = 0.23% latency, 599.15 TFLOPS, in_features=10240, out_features=3840, bias=False)
(act_fn): PytorchGELUTanh(0 = 0% Params, 0 MACs = 0% MACs, 94.65 us = 0.04% latency, 443.13 GFLOPS)
)
)
(12): DiTLayer(
250.71 M = 3.1% Params, 681.9 GMACs = 3.12% MACs, 6.88 ms = 2.99% latency, 198.31 TFLOPS
(input_layernorm): AdaLayerNormZero(
88.5 M = 1.1% Params, 1.42 GMACs = 0.01% MACs, 898.84 us = 0.39% latency, 3.15 TFLOPS
(silu): SiLU(0 = 0% Params, 0 MACs = 0% MACs, 35.29 us = 0.02% latency, 1.74 GFLOPS)
(linear): Linear(88.5 M = 1.1% Params, 1.42 GMACs = 0.01% MACs, 226.02 us = 0.1% latency, 12.53 TFLOPS, in_features=3840, out_features=23040, bias=True)
(norm): GemmaRMSNorm(3.84 K = 0% Params, 0 MACs = 0% MACs, 438.69 us = 0.19% latency, 0 FLOPS)
)
(self_attn): DiTSelfAttention(
44.24 M = 0.55% Params, 197.3 GMACs = 0.9% MACs, 2.92 ms = 1.27% latency, 135.34 TFLOPS
(q_norm): GemmaRMSNorm(120 = 0% Params, 0 MACs = 0% MACs, 442.98 us = 0.19% latency, 0 FLOPS)
(k_norm): GemmaRMSNorm(120 = 0% Params, 0 MACs = 0% MACs, 228.88 us = 0.1% latency, 0 FLOPS)
(q_proj): Linear(14.75 M = 0.18% Params, 60.4 GMACs = 0.28% MACs, 321.39 us = 0.14% latency, 375.86 TFLOPS, in_features=3840, out_features=3840, bias=False)
(k_proj): Linear(3.69 M = 0.05% Params, 15.1 GMACs = 0.07% MACs, 191.93 us = 0.08% latency, 157.35 TFLOPS, in_features=3840, out_features=960, bias=False)
(v_proj): Linear(3.69 M = 0.05% Params, 15.1 GMACs = 0.07% MACs, 189.78 us = 0.08% latency, 159.13 TFLOPS, in_features=3840, out_features=960, bias=False)
(text_k_proj): Linear(3.69 M = 0.05% Params, 15.1 GMACs = 0.07% MACs, 153.54 us = 0.07% latency, 196.68 TFLOPS, in_features=3840, out_features=960, bias=False)
(text_v_proj): Linear(3.69 M = 0.05% Params, 15.1 GMACs = 0.07% MACs, 154.97 us = 0.07% latency, 194.87 TFLOPS, in_features=3840, out_features=960, bias=False)
(o_proj): Linear(14.75 M = 0.18% Params, 60.4 GMACs = 0.28% MACs, 306.84 us = 0.13% latency, 393.67 TFLOPS, in_features=3840, out_features=3840, bias=False)
)
(post_attention_layernorm): GemmaRMSNorm(3.84 K = 0% Params, 0 MACs = 0% MACs, 429.87 us = 0.19% latency, 0 FLOPS)
(mlp): GemmaMLP(
117.96 M = 1.46% Params, 483.18 GMACs = 2.21% MACs, 2.02 ms = 0.88% latency, 479.07 TFLOPS
(gate_proj): Linear(39.32 M = 0.49% Params, 161.06 GMACs = 0.74% MACs, 589.61 us = 0.26% latency, 546.33 TFLOPS, in_features=3840, out_features=10240, bias=False)
(up_proj): Linear(39.32 M = 0.49% Params, 161.06 GMACs = 0.74% MACs, 586.03 us = 0.25% latency, 549.67 TFLOPS, in_features=3840, out_features=10240, bias=False)
(down_proj): Linear(39.32 M = 0.49% Params, 161.06 GMACs = 0.74% MACs, 543.12 us = 0.24% latency, 593.1 TFLOPS, in_features=10240, out_features=3840, bias=False)
(act_fn): PytorchGELUTanh(0 = 0% Params, 0 MACs = 0% MACs, 102.76 us = 0.04% latency, 408.17 GFLOPS)
)
)
(13): DiTLayer(
250.71 M = 3.1% Params, 681.9 GMACs = 3.12% MACs, 6.83 ms = 2.97% latency, 199.68 TFLOPS
(input_layernorm): AdaLayerNormZero(
88.5 M = 1.1% Params, 1.42 GMACs = 0.01% MACs, 882.63 us = 0.38% latency, 3.21 TFLOPS
(silu): SiLU(0 = 0% Params, 0 MACs = 0% MACs, 35.05 us = 0.02% latency, 1.75 GFLOPS)
(linear): Linear(88.5 M = 1.1% Params, 1.42 GMACs = 0.01% MACs, 219.11 us = 0.1% latency, 12.92 TFLOPS, in_features=3840, out_features=23040, bias=True)
(norm): GemmaRMSNorm(3.84 K = 0% Params, 0 MACs = 0% MACs, 431.54 us = 0.19% latency, 0 FLOPS)
)
(self_attn): DiTSelfAttention(
44.24 M = 0.55% Params, 197.3 GMACs = 0.9% MACs, 2.91 ms = 1.26% latency, 135.73 TFLOPS
(q_norm): GemmaRMSNorm(120 = 0% Params, 0 MACs = 0% MACs, 440.84 us = 0.19% latency, 0 FLOPS)
(k_norm): GemmaRMSNorm(120 = 0% Params, 0 MACs = 0% MACs, 228.17 us = 0.1% latency, 0 FLOPS)
(q_proj): Linear(14.75 M = 0.18% Params, 60.4 GMACs = 0.28% MACs, 325.2 us = 0.14% latency, 371.45 TFLOPS, in_features=3840, out_features=3840, bias=False)
(k_proj): Linear(3.69 M = 0.05% Params, 15.1 GMACs = 0.07% MACs, 190.73 us = 0.08% latency, 158.33 TFLOPS, in_features=3840, out_features=960, bias=False)
(v_proj): Linear(3.69 M = 0.05% Params, 15.1 GMACs = 0.07% MACs, 188.59 us = 0.08% latency, 160.13 TFLOPS, in_features=3840, out_features=960, bias=False)
(text_k_proj): Linear(3.69 M = 0.05% Params, 15.1 GMACs = 0.07% MACs, 151.4 us = 0.07% latency, 199.47 TFLOPS, in_features=3840, out_features=960, bias=False)
(text_v_proj): Linear(3.69 M = 0.05% Params, 15.1 GMACs = 0.07% MACs, 149.01 us = 0.06% latency, 202.66 TFLOPS, in_features=3840, out_features=960, bias=False)
(o_proj): Linear(14.75 M = 0.18% Params, 60.4 GMACs = 0.28% MACs, 306.37 us = 0.13% latency, 394.28 TFLOPS, in_features=3840, out_features=3840, bias=False)
)
(post_attention_layernorm): GemmaRMSNorm(3.84 K = 0% Params, 0 MACs = 0% MACs, 430.11 us = 0.19% latency, 0 FLOPS)
(mlp): GemmaMLP(
117.96 M = 1.46% Params, 483.18 GMACs = 2.21% MACs, 1.99 ms = 0.87% latency, 484.63 TFLOPS
(gate_proj): Linear(39.32 M = 0.49% Params, 161.06 GMACs = 0.74% MACs, 585.56 us = 0.25% latency, 550.11 TFLOPS, in_features=3840, out_features=10240, bias=False)
(up_proj): Linear(39.32 M = 0.49% Params, 161.06 GMACs = 0.74% MACs, 581.26 us = 0.25% latency, 554.18 TFLOPS, in_features=3840, out_features=10240, bias=False)
(down_proj): Linear(39.32 M = 0.49% Params, 161.06 GMACs = 0.74% MACs, 538.11 us = 0.23% latency, 598.62 TFLOPS, in_features=10240, out_features=3840, bias=False)
(act_fn): PytorchGELUTanh(0 = 0% Params, 0 MACs = 0% MACs, 95.13 us = 0.04% latency, 440.91 GFLOPS)
)
)
(14): DiTLayer(
250.71 M = 3.1% Params, 681.9 GMACs = 3.12% MACs, 6.87 ms = 2.99% latency, 198.59 TFLOPS
(input_layernorm): AdaLayerNormZero(
88.5 M = 1.1% Params, 1.42 GMACs = 0.01% MACs, 901.46 us = 0.39% latency, 3.14 TFLOPS
(silu): SiLU(0 = 0% Params, 0 MACs = 0% MACs, 41.96 us = 0.02% latency, 1.46 GFLOPS)
(linear): Linear(88.5 M = 1.1% Params, 1.42 GMACs = 0.01% MACs, 223.88 us = 0.1% latency, 12.65 TFLOPS, in_features=3840, out_features=23040, bias=True)
(norm): GemmaRMSNorm(3.84 K = 0% Params, 0 MACs = 0% MACs, 437.02 us = 0.19% latency, 0 FLOPS)
)
(self_attn): DiTSelfAttention(
44.24 M = 0.55% Params, 197.3 GMACs = 0.9% MACs, 2.9 ms = 1.26% latency, 136.11 TFLOPS
(q_norm): GemmaRMSNorm(120 = 0% Params, 0 MACs = 0% MACs, 442.74 us = 0.19% latency, 0 FLOPS)
(k_norm): GemmaRMSNorm(120 = 0% Params, 0 MACs = 0% MACs, 226.74 us = 0.1% latency, 0 FLOPS)
(q_proj): Linear(14.75 M = 0.18% Params, 60.4 GMACs = 0.28% MACs, 319.24 us = 0.14% latency, 378.38 TFLOPS, in_features=3840, out_features=3840, bias=False)
(k_proj): Linear(3.69 M = 0.05% Params, 15.1 GMACs = 0.07% MACs, 191.21 us = 0.08% latency, 157.93 TFLOPS, in_features=3840, out_features=960, bias=False)
(v_proj): Linear(3.69 M = 0.05% Params, 15.1 GMACs = 0.07% MACs, 188.83 us = 0.08% latency, 159.93 TFLOPS, in_features=3840, out_features=960, bias=False)
(text_k_proj): Linear(3.69 M = 0.05% Params, 15.1 GMACs = 0.07% MACs, 150.68 us = 0.07% latency, 200.42 TFLOPS, in_features=3840, out_features=960, bias=False)
(text_v_proj): Linear(3.69 M = 0.05% Params, 15.1 GMACs = 0.07% MACs, 148.77 us = 0.06% latency, 202.99 TFLOPS, in_features=3840, out_features=960, bias=False)
(o_proj): Linear(14.75 M = 0.18% Params, 60.4 GMACs = 0.28% MACs, 309.23 us = 0.13% latency, 390.64 TFLOPS, in_features=3840, out_features=3840, bias=False)
)
(post_attention_layernorm): GemmaRMSNorm(3.84 K = 0% Params, 0 MACs = 0% MACs, 431.54 us = 0.19% latency, 0 FLOPS)
(mlp): GemmaMLP(
117.96 M = 1.46% Params, 483.18 GMACs = 2.21% MACs, 2.02 ms = 0.88% latency, 478.79 TFLOPS
(gate_proj): Linear(39.32 M = 0.49% Params, 161.06 GMACs = 0.74% MACs, 590.56 us = 0.26% latency, 545.45 TFLOPS, in_features=3840, out_features=10240, bias=False)
(up_proj): Linear(39.32 M = 0.49% Params, 161.06 GMACs = 0.74% MACs, 588.66 us = 0.26% latency, 547.22 TFLOPS, in_features=3840, out_features=10240, bias=False)
(down_proj): Linear(39.32 M = 0.49% Params, 161.06 GMACs = 0.74% MACs, 543.36 us = 0.24% latency, 592.84 TFLOPS, in_features=10240, out_features=3840, bias=False)
(act_fn): PytorchGELUTanh(0 = 0% Params, 0 MACs = 0% MACs, 97.04 us = 0.04% latency, 432.24 GFLOPS)
)
)
(15): DiTLayer(
250.71 M = 3.1% Params, 681.9 GMACs = 3.12% MACs, 6.89 ms = 3% latency, 197.98 TFLOPS
(input_layernorm): AdaLayerNormZero(
88.5 M = 1.1% Params, 1.42 GMACs = 0.01% MACs, 910.28 us = 0.4% latency, 3.11 TFLOPS
(silu): SiLU(0 = 0% Params, 0 MACs = 0% MACs, 36.95 us = 0.02% latency, 1.66 GFLOPS)
(linear): Linear(88.5 M = 1.1% Params, 1.42 GMACs = 0.01% MACs, 236.51 us = 0.1% latency, 11.97 TFLOPS, in_features=3840, out_features=23040, bias=True)
(norm): GemmaRMSNorm(3.84 K = 0% Params, 0 MACs = 0% MACs, 434.88 us = 0.19% latency, 0 FLOPS)
)
(self_attn): DiTSelfAttention(
44.24 M = 0.55% Params, 197.3 GMACs = 0.9% MACs, 2.91 ms = 1.27% latency, 135.52 TFLOPS
(q_norm): GemmaRMSNorm(120 = 0% Params, 0 MACs = 0% MACs, 440.36 us = 0.19% latency, 0 FLOPS)
(k_norm): GemmaRMSNorm(120 = 0% Params, 0 MACs = 0% MACs, 227.93 us = 0.1% latency, 0 FLOPS)
(q_proj): Linear(14.75 M = 0.18% Params, 60.4 GMACs = 0.28% MACs, 319 us = 0.14% latency, 378.67 TFLOPS, in_features=3840, out_features=3840, bias=False)
(k_proj): Linear(3.69 M = 0.05% Params, 15.1 GMACs = 0.07% MACs, 191.45 us = 0.08% latency, 157.74 TFLOPS, in_features=3840, out_features=960, bias=False)
(v_proj): Linear(3.69 M = 0.05% Params, 15.1 GMACs = 0.07% MACs, 190.26 us = 0.08% latency, 158.73 TFLOPS, in_features=3840, out_features=960, bias=False)
(text_k_proj): Linear(3.69 M = 0.05% Params, 15.1 GMACs = 0.07% MACs, 152.83 us = 0.07% latency, 197.6 TFLOPS, in_features=3840, out_features=960, bias=False)
(text_v_proj): Linear(3.69 M = 0.05% Params, 15.1 GMACs = 0.07% MACs, 150.68 us = 0.07% latency, 200.42 TFLOPS, in_features=3840, out_features=960, bias=False)
(o_proj): Linear(14.75 M = 0.18% Params, 60.4 GMACs = 0.28% MACs, 310.42 us = 0.14% latency, 389.14 TFLOPS, in_features=3840, out_features=3840, bias=False)
)
(post_attention_layernorm): GemmaRMSNorm(3.84 K = 0% Params, 0 MACs = 0% MACs, 438.93 us = 0.19% latency, 0 FLOPS)
(mlp): GemmaMLP(
117.96 M = 1.46% Params, 483.18 GMACs = 2.21% MACs, 2.01 ms = 0.87% latency, 481.35 TFLOPS
(gate_proj): Linear(39.32 M = 0.49% Params, 161.06 GMACs = 0.74% MACs, 587.7 us = 0.26% latency, 548.11 TFLOPS, in_features=3840, out_features=10240, bias=False)
(up_proj): Linear(39.32 M = 0.49% Params, 161.06 GMACs = 0.74% MACs, 582.46 us = 0.25% latency, 553.04 TFLOPS, in_features=3840, out_features=10240, bias=False)
(down_proj): Linear(39.32 M = 0.49% Params, 161.06 GMACs = 0.74% MACs, 543.83 us = 0.24% latency, 592.32 TFLOPS, in_features=10240, out_features=3840, bias=False)
(act_fn): PytorchGELUTanh(0 = 0% Params, 0 MACs = 0% MACs, 96.8 us = 0.04% latency, 433.31 GFLOPS)
)
)
(16): DiTLayer(
250.71 M = 3.1% Params, 681.9 GMACs = 3.12% MACs, 6.96 ms = 3.03% latency, 195.89 TFLOPS
(input_layernorm): AdaLayerNormZero(
88.5 M = 1.1% Params, 1.42 GMACs = 0.01% MACs, 904.56 us = 0.39% latency, 3.13 TFLOPS
(silu): SiLU(0 = 0% Params, 0 MACs = 0% MACs, 37.43 us = 0.02% latency, 1.64 GFLOPS)
(linear): Linear(88.5 M = 1.1% Params, 1.42 GMACs = 0.01% MACs, 226.74 us = 0.1% latency, 12.49 TFLOPS, in_features=3840, out_features=23040, bias=True)
(norm): GemmaRMSNorm(3.84 K = 0% Params, 0 MACs = 0% MACs, 432.49 us = 0.19% latency, 0 FLOPS)
)
(self_attn): DiTSelfAttention(
44.24 M = 0.55% Params, 197.3 GMACs = 0.9% MACs, 2.95 ms = 1.28% latency, 133.64 TFLOPS
(q_norm): GemmaRMSNorm(120 = 0% Params, 0 MACs = 0% MACs, 443.94 us = 0.19% latency, 0 FLOPS)
(k_norm): GemmaRMSNorm(120 = 0% Params, 0 MACs = 0% MACs, 229.84 us = 0.1% latency, 0 FLOPS)
(q_proj): Linear(14.75 M = 0.18% Params, 60.4 GMACs = 0.28% MACs, 321.63 us = 0.14% latency, 375.58 TFLOPS, in_features=3840, out_features=3840, bias=False)
(k_proj): Linear(3.69 M = 0.05% Params, 15.1 GMACs = 0.07% MACs, 194.31 us = 0.08% latency, 155.42 TFLOPS, in_features=3840, out_features=960, bias=False)
(v_proj): Linear(3.69 M = 0.05% Params, 15.1 GMACs = 0.07% MACs, 190.26 us = 0.08% latency, 158.73 TFLOPS, in_features=3840, out_features=960, bias=False)
(text_k_proj): Linear(3.69 M = 0.05% Params, 15.1 GMACs = 0.07% MACs, 160.46 us = 0.07% latency, 188.21 TFLOPS, in_features=3840, out_features=960, bias=False)
(text_v_proj): Linear(3.69 M = 0.05% Params, 15.1 GMACs = 0.07% MACs, 159.98 us = 0.07% latency, 188.77 TFLOPS, in_features=3840, out_features=960, bias=False)
(o_proj): Linear(14.75 M = 0.18% Params, 60.4 GMACs = 0.28% MACs, 318.53 us = 0.14% latency, 379.23 TFLOPS, in_features=3840, out_features=3840, bias=False)
)
(post_attention_layernorm): GemmaRMSNorm(3.84 K = 0% Params, 0 MACs = 0% MACs, 430.11 us = 0.19% latency, 0 FLOPS)
(mlp): GemmaMLP(
117.96 M = 1.46% Params, 483.18 GMACs = 2.21% MACs, 2.04 ms = 0.89% latency, 472.76 TFLOPS
(gate_proj): Linear(39.32 M = 0.49% Params, 161.06 GMACs = 0.74% MACs, 590.8 us = 0.26% latency, 545.23 TFLOPS, in_features=3840, out_features=10240, bias=False)
(up_proj): Linear(39.32 M = 0.49% Params, 161.06 GMACs = 0.74% MACs, 591.99 us = 0.26% latency, 544.13 TFLOPS, in_features=3840, out_features=10240, bias=False)
(down_proj): Linear(39.32 M = 0.49% Params, 161.06 GMACs = 0.74% MACs, 551.7 us = 0.24% latency, 583.87 TFLOPS, in_features=10240, out_features=3840, bias=False)
(act_fn): PytorchGELUTanh(0 = 0% Params, 0 MACs = 0% MACs, 101.09 us = 0.04% latency, 414.91 GFLOPS)
)
)
(17): DiTLayer(
250.71 M = 3.1% Params, 681.9 GMACs = 3.12% MACs, 6.89 ms = 2.99% latency, 198.08 TFLOPS
(input_layernorm): AdaLayerNormZero(
88.5 M = 1.1% Params, 1.42 GMACs = 0.01% MACs, 909.81 us = 0.4% latency, 3.11 TFLOPS
(silu): SiLU(0 = 0% Params, 0 MACs = 0% MACs, 39.1 us = 0.02% latency, 1.57 GFLOPS)
(linear): Linear(88.5 M = 1.1% Params, 1.42 GMACs = 0.01% MACs, 234.37 us = 0.1% latency, 12.08 TFLOPS, in_features=3840, out_features=23040, bias=True)
(norm): GemmaRMSNorm(3.84 K = 0% Params, 0 MACs = 0% MACs, 433.44 us = 0.19% latency, 0 FLOPS)
)
(self_attn): DiTSelfAttention(
44.24 M = 0.55% Params, 197.3 GMACs = 0.9% MACs, 2.93 ms = 1.27% latency, 134.73 TFLOPS
(q_norm): GemmaRMSNorm(120 = 0% Params, 0 MACs = 0% MACs, 442.74 us = 0.19% latency, 0 FLOPS)
(k_norm): GemmaRMSNorm(120 = 0% Params, 0 MACs = 0% MACs, 226.5 us = 0.1% latency, 0 FLOPS)
(q_proj): Linear(14.75 M = 0.18% Params, 60.4 GMACs = 0.28% MACs, 320.43 us = 0.14% latency, 376.98 TFLOPS, in_features=3840, out_features=3840, bias=False)
(k_proj): Linear(3.69 M = 0.05% Params, 15.1 GMACs = 0.07% MACs, 192.64 us = 0.08% latency, 156.76 TFLOPS, in_features=3840, out_features=960, bias=False)
(v_proj): Linear(3.69 M = 0.05% Params, 15.1 GMACs = 0.07% MACs, 189.54 us = 0.08% latency, 159.33 TFLOPS, in_features=3840, out_features=960, bias=False)
(text_k_proj): Linear(3.69 M = 0.05% Params, 15.1 GMACs = 0.07% MACs, 166.89 us = 0.07% latency, 180.95 TFLOPS, in_features=3840, out_features=960, bias=False)
(text_v_proj): Linear(3.69 M = 0.05% Params, 15.1 GMACs = 0.07% MACs, 156.4 us = 0.07% latency, 193.08 TFLOPS, in_features=3840, out_features=960, bias=False)
(o_proj): Linear(14.75 M = 0.18% Params, 60.4 GMACs = 0.28% MACs, 303.98 us = 0.13% latency, 397.38 TFLOPS, in_features=3840, out_features=3840, bias=False)
)
(post_attention_layernorm): GemmaRMSNorm(3.84 K = 0% Params, 0 MACs = 0% MACs, 430.35 us = 0.19% latency, 0 FLOPS)
(mlp): GemmaMLP(
117.96 M = 1.46% Params, 483.18 GMACs = 2.21% MACs, 2 ms = 0.87% latency, 483.24 TFLOPS
(gate_proj): Linear(39.32 M = 0.49% Params, 161.06 GMACs = 0.74% MACs, 586.03 us = 0.25% latency, 549.67 TFLOPS, in_features=3840, out_features=10240, bias=False)
(up_proj): Linear(39.32 M = 0.49% Params, 161.06 GMACs = 0.74% MACs, 583.89 us = 0.25% latency, 551.69 TFLOPS, in_features=3840, out_features=10240, bias=False)
(down_proj): Linear(39.32 M = 0.49% Params, 161.06 GMACs = 0.74% MACs, 539.06 us = 0.23% latency, 597.56 TFLOPS, in_features=10240, out_features=3840, bias=False)
(act_fn): PytorchGELUTanh(0 = 0% Params, 0 MACs = 0% MACs, 94.41 us = 0.04% latency, 444.25 GFLOPS)
)
)
(18): DiTLayer(
250.71 M = 3.1% Params, 681.9 GMACs = 3.12% MACs, 6.9 ms = 3% latency, 197.55 TFLOPS
(input_layernorm): AdaLayerNormZero(
88.5 M = 1.1% Params, 1.42 GMACs = 0.01% MACs, 895.98 us = 0.39% latency, 3.16 TFLOPS
(silu): SiLU(0 = 0% Params, 0 MACs = 0% MACs, 36.24 us = 0.02% latency, 1.7 GFLOPS)
(linear): Linear(88.5 M = 1.1% Params, 1.42 GMACs = 0.01% MACs, 226.02 us = 0.1% latency, 12.53 TFLOPS, in_features=3840, out_features=23040, bias=True)
(norm): GemmaRMSNorm(3.84 K = 0% Params, 0 MACs = 0% MACs, 433.68 us = 0.19% latency, 0 FLOPS)
)
(self_attn): DiTSelfAttention(
44.24 M = 0.55% Params, 197.3 GMACs = 0.9% MACs, 2.93 ms = 1.27% latency, 134.77 TFLOPS
(q_norm): GemmaRMSNorm(120 = 0% Params, 0 MACs = 0% MACs, 442.5 us = 0.19% latency, 0 FLOPS)
(k_norm): GemmaRMSNorm(120 = 0% Params, 0 MACs = 0% MACs, 228.64 us = 0.1% latency, 0 FLOPS)
(q_proj): Linear(14.75 M = 0.18% Params, 60.4 GMACs = 0.28% MACs, 319.48 us = 0.14% latency, 378.1 TFLOPS, in_features=3840, out_features=3840, bias=False)
(k_proj): Linear(3.69 M = 0.05% Params, 15.1 GMACs = 0.07% MACs, 192.17 us = 0.08% latency, 157.15 TFLOPS, in_features=3840, out_features=960, bias=False)
(v_proj): Linear(3.69 M = 0.05% Params, 15.1 GMACs = 0.07% MACs, 189.78 us = 0.08% latency, 159.13 TFLOPS, in_features=3840, out_features=960, bias=False)
(text_k_proj): Linear(3.69 M = 0.05% Params, 15.1 GMACs = 0.07% MACs, 154.5 us = 0.07% latency, 195.47 TFLOPS, in_features=3840, out_features=960, bias=False)
(text_v_proj): Linear(3.69 M = 0.05% Params, 15.1 GMACs = 0.07% MACs, 149.73 us = 0.07% latency, 201.69 TFLOPS, in_features=3840, out_features=960, bias=False)
(o_proj): Linear(14.75 M = 0.18% Params, 60.4 GMACs = 0.28% MACs, 316.14 us = 0.14% latency, 382.09 TFLOPS, in_features=3840, out_features=3840, bias=False)
)
(post_attention_layernorm): GemmaRMSNorm(3.84 K = 0% Params, 0 MACs = 0% MACs, 431.3 us = 0.19% latency, 0 FLOPS)
(mlp): GemmaMLP(
117.96 M = 1.46% Params, 483.18 GMACs = 2.21% MACs, 2.03 ms = 0.88% latency, 476.87 TFLOPS
(gate_proj): Linear(39.32 M = 0.49% Params, 161.06 GMACs = 0.74% MACs, 592.23 us = 0.26% latency, 543.91 TFLOPS, in_features=3840, out_features=10240, bias=False)
(up_proj): Linear(39.32 M = 0.49% Params, 161.06 GMACs = 0.74% MACs, 597 us = 0.26% latency, 539.57 TFLOPS, in_features=3840, out_features=10240, bias=False)
(down_proj): Linear(39.32 M = 0.49% Params, 161.06 GMACs = 0.74% MACs, 543.83 us = 0.24% latency, 592.32 TFLOPS, in_features=10240, out_features=3840, bias=False)
(act_fn): PytorchGELUTanh(0 = 0% Params, 0 MACs = 0% MACs, 96.8 us = 0.04% latency, 433.31 GFLOPS)
)
)
(19): DiTLayer(
250.71 M = 3.1% Params, 681.9 GMACs = 3.12% MACs, 6.86 ms = 2.98% latency, 198.75 TFLOPS
(input_layernorm): AdaLayerNormZero(
88.5 M = 1.1% Params, 1.42 GMACs = 0.01% MACs, 901.94 us = 0.39% latency, 3.14 TFLOPS
(silu): SiLU(0 = 0% Params, 0 MACs = 0% MACs, 41.25 us = 0.02% latency, 1.49 GFLOPS)
(linear): Linear(88.5 M = 1.1% Params, 1.42 GMACs = 0.01% MACs, 228.88 us = 0.1% latency, 12.37 TFLOPS, in_features=3840, out_features=23040, bias=True)
(norm): GemmaRMSNorm(3.84 K = 0% Params, 0 MACs = 0% MACs, 432.73 us = 0.19% latency, 0 FLOPS)
)
(self_attn): DiTSelfAttention(
44.24 M = 0.55% Params, 197.3 GMACs = 0.9% MACs, 2.92 ms = 1.27% latency, 135.11 TFLOPS
(q_norm): GemmaRMSNorm(120 = 0% Params, 0 MACs = 0% MACs, 439.88 us = 0.19% latency, 0 FLOPS)
(k_norm): GemmaRMSNorm(120 = 0% Params, 0 MACs = 0% MACs, 228.4 us = 0.1% latency, 0 FLOPS)
(q_proj): Linear(14.75 M = 0.18% Params, 60.4 GMACs = 0.28% MACs, 317.34 us = 0.14% latency, 380.66 TFLOPS, in_features=3840, out_features=3840, bias=False)
(k_proj): Linear(3.69 M = 0.05% Params, 15.1 GMACs = 0.07% MACs, 192.17 us = 0.08% latency, 157.15 TFLOPS, in_features=3840, out_features=960, bias=False)
(v_proj): Linear(3.69 M = 0.05% Params, 15.1 GMACs = 0.07% MACs, 189.54 us = 0.08% latency, 159.33 TFLOPS, in_features=3840, out_features=960, bias=False)
(text_k_proj): Linear(3.69 M = 0.05% Params, 15.1 GMACs = 0.07% MACs, 151.63 us = 0.07% latency, 199.16 TFLOPS, in_features=3840, out_features=960, bias=False)
(text_v_proj): Linear(3.69 M = 0.05% Params, 15.1 GMACs = 0.07% MACs, 151.87 us = 0.07% latency, 198.84 TFLOPS, in_features=3840, out_features=960, bias=False)
(o_proj): Linear(14.75 M = 0.18% Params, 60.4 GMACs = 0.28% MACs, 323.06 us = 0.14% latency, 373.92 TFLOPS, in_features=3840, out_features=3840, bias=False)
)
(post_attention_layernorm): GemmaRMSNorm(3.84 K = 0% Params, 0 MACs = 0% MACs, 429.87 us = 0.19% latency, 0 FLOPS)
(mlp): GemmaMLP(
117.96 M = 1.46% Params, 483.18 GMACs = 2.21% MACs, 1.99 ms = 0.87% latency, 484.74 TFLOPS
(gate_proj): Linear(39.32 M = 0.49% Params, 161.06 GMACs = 0.74% MACs, 584.36 us = 0.25% latency, 551.24 TFLOPS, in_features=3840, out_features=10240, bias=False)
(up_proj): Linear(39.32 M = 0.49% Params, 161.06 GMACs = 0.74% MACs, 580.55 us = 0.25% latency, 554.86 TFLOPS, in_features=3840, out_features=10240, bias=False)
(down_proj): Linear(39.32 M = 0.49% Params, 161.06 GMACs = 0.74% MACs, 539.3 us = 0.23% latency, 597.29 TFLOPS, in_features=10240, out_features=3840, bias=False)
(act_fn): PytorchGELUTanh(0 = 0% Params, 0 MACs = 0% MACs, 95.13 us = 0.04% latency, 440.91 GFLOPS)
)
)
(20): DiTLayer(
250.71 M = 3.1% Params, 681.9 GMACs = 3.12% MACs, 6.89 ms = 3% latency, 198.06 TFLOPS
(input_layernorm): AdaLayerNormZero(
88.5 M = 1.1% Params, 1.42 GMACs = 0.01% MACs, 900.98 us = 0.39% latency, 3.14 TFLOPS
(silu): SiLU(0 = 0% Params, 0 MACs = 0% MACs, 36.48 us = 0.02% latency, 1.68 GFLOPS)
(linear): Linear(88.5 M = 1.1% Params, 1.42 GMACs = 0.01% MACs, 224.35 us = 0.1% latency, 12.62 TFLOPS, in_features=3840, out_features=23040, bias=True)
(norm): GemmaRMSNorm(3.84 K = 0% Params, 0 MACs = 0% MACs, 434.16 us = 0.19% latency, 0 FLOPS)
)
(self_attn): DiTSelfAttention(
44.24 M = 0.55% Params, 197.3 GMACs = 0.9% MACs, 2.92 ms = 1.27% latency, 135.01 TFLOPS
(q_norm): GemmaRMSNorm(120 = 0% Params, 0 MACs = 0% MACs, 443.22 us = 0.19% latency, 0 FLOPS)
(k_norm): GemmaRMSNorm(120 = 0% Params, 0 MACs = 0% MACs, 229.6 us = 0.1% latency, 0 FLOPS)
(q_proj): Linear(14.75 M = 0.18% Params, 60.4 GMACs = 0.28% MACs, 322.1 us = 0.14% latency, 375.02 TFLOPS, in_features=3840, out_features=3840, bias=False)
(k_proj): Linear(3.69 M = 0.05% Params, 15.1 GMACs = 0.07% MACs, 192.88 us = 0.08% latency, 156.57 TFLOPS, in_features=3840, out_features=960, bias=False)
(v_proj): Linear(3.69 M = 0.05% Params, 15.1 GMACs = 0.07% MACs, 191.69 us = 0.08% latency, 157.54 TFLOPS, in_features=3840, out_features=960, bias=False)
(text_k_proj): Linear(3.69 M = 0.05% Params, 15.1 GMACs = 0.07% MACs, 158.79 us = 0.07% latency, 190.19 TFLOPS, in_features=3840, out_features=960, bias=False)
(text_v_proj): Linear(3.69 M = 0.05% Params, 15.1 GMACs = 0.07% MACs, 152.59 us = 0.07% latency, 197.91 TFLOPS, in_features=3840, out_features=960, bias=False)
(o_proj): Linear(14.75 M = 0.18% Params, 60.4 GMACs = 0.28% MACs, 305.89 us = 0.13% latency, 394.9 TFLOPS, in_features=3840, out_features=3840, bias=False)
)
(post_attention_layernorm): GemmaRMSNorm(3.84 K = 0% Params, 0 MACs = 0% MACs, 431.06 us = 0.19% latency, 0 FLOPS)
(mlp): GemmaMLP(
117.96 M = 1.46% Params, 483.18 GMACs = 2.21% MACs, 2.02 ms = 0.88% latency, 479.47 TFLOPS
(gate_proj): Linear(39.32 M = 0.49% Params, 161.06 GMACs = 0.74% MACs, 589.13 us = 0.26% latency, 546.77 TFLOPS, in_features=3840, out_features=10240, bias=False)
(up_proj): Linear(39.32 M = 0.49% Params, 161.06 GMACs = 0.74% MACs, 587.22 us = 0.26% latency, 548.55 TFLOPS, in_features=3840, out_features=10240, bias=False)
(down_proj): Linear(39.32 M = 0.49% Params, 161.06 GMACs = 0.74% MACs, 546.93 us = 0.24% latency, 588.96 TFLOPS, in_features=10240, out_features=3840, bias=False)
(act_fn): PytorchGELUTanh(0 = 0% Params, 0 MACs = 0% MACs, 96.08 us = 0.04% latency, 436.53 GFLOPS)
)
)
(21): DiTLayer(
250.71 M = 3.1% Params, 681.9 GMACs = 3.12% MACs, 6.85 ms = 2.98% latency, 199.02 TFLOPS
(input_layernorm): AdaLayerNormZero(
88.5 M = 1.1% Params, 1.42 GMACs = 0.01% MACs, 904.32 us = 0.39% latency, 3.13 TFLOPS
(silu): SiLU(0 = 0% Params, 0 MACs = 0% MACs, 38.39 us = 0.02% latency, 1.6 GFLOPS)
(linear): Linear(88.5 M = 1.1% Params, 1.42 GMACs = 0.01% MACs, 231.27 us = 0.1% latency, 12.24 TFLOPS, in_features=3840, out_features=23040, bias=True)
(norm): GemmaRMSNorm(3.84 K = 0% Params, 0 MACs = 0% MACs, 433.21 us = 0.19% latency, 0 FLOPS)
)
(self_attn): DiTSelfAttention(
44.24 M = 0.55% Params, 197.3 GMACs = 0.9% MACs, 2.9 ms = 1.26% latency, 136.24 TFLOPS
(q_norm): GemmaRMSNorm(120 = 0% Params, 0 MACs = 0% MACs, 441.55 us = 0.19% latency, 0 FLOPS)
(k_norm): GemmaRMSNorm(120 = 0% Params, 0 MACs = 0% MACs, 226.97 us = 0.1% latency, 0 FLOPS)
(q_proj): Linear(14.75 M = 0.18% Params, 60.4 GMACs = 0.28% MACs, 319.96 us = 0.14% latency, 377.54 TFLOPS, in_features=3840, out_features=3840, bias=False)
(k_proj): Linear(3.69 M = 0.05% Params, 15.1 GMACs = 0.07% MACs, 191.69 us = 0.08% latency, 157.54 TFLOPS, in_features=3840, out_features=960, bias=False)
(v_proj): Linear(3.69 M = 0.05% Params, 15.1 GMACs = 0.07% MACs, 190.02 us = 0.08% latency, 158.93 TFLOPS, in_features=3840, out_features=960, bias=False)
(text_k_proj): Linear(3.69 M = 0.05% Params, 15.1 GMACs = 0.07% MACs, 151.4 us = 0.07% latency, 199.47 TFLOPS, in_features=3840, out_features=960, bias=False)
(text_v_proj): Linear(3.69 M = 0.05% Params, 15.1 GMACs = 0.07% MACs, 151.4 us = 0.07% latency, 199.47 TFLOPS, in_features=3840, out_features=960, bias=False)
(o_proj): Linear(14.75 M = 0.18% Params, 60.4 GMACs = 0.28% MACs, 302.08 us = 0.13% latency, 399.89 TFLOPS, in_features=3840, out_features=3840, bias=False)
)
(post_attention_layernorm): GemmaRMSNorm(3.84 K = 0% Params, 0 MACs = 0% MACs, 431.78 us = 0.19% latency, 0 FLOPS)
(mlp): GemmaMLP(
117.96 M = 1.46% Params, 483.18 GMACs = 2.21% MACs, 2 ms = 0.87% latency, 483.18 TFLOPS
(gate_proj): Linear(39.32 M = 0.49% Params, 161.06 GMACs = 0.74% MACs, 585.79 us = 0.25% latency, 549.89 TFLOPS, in_features=3840, out_features=10240, bias=False)
(up_proj): Linear(39.32 M = 0.49% Params, 161.06 GMACs = 0.74% MACs, 582.7 us = 0.25% latency, 552.82 TFLOPS, in_features=3840, out_features=10240, bias=False)
(down_proj): Linear(39.32 M = 0.49% Params, 161.06 GMACs = 0.74% MACs, 540.97 us = 0.24% latency, 595.45 TFLOPS, in_features=10240, out_features=3840, bias=False)
(act_fn): PytorchGELUTanh(0 = 0% Params, 0 MACs = 0% MACs, 94.89 us = 0.04% latency, 442.01 GFLOPS)
)
)
(22): DiTLayer(
250.71 M = 3.1% Params, 681.9 GMACs = 3.12% MACs, 6.86 ms = 2.98% latency, 198.89 TFLOPS
(input_layernorm): AdaLayerNormZero(
88.5 M = 1.1% Params, 1.42 GMACs = 0.01% MACs, 887.87 us = 0.39% latency, 3.19 TFLOPS
(silu): SiLU(0 = 0% Params, 0 MACs = 0% MACs, 35.05 us = 0.02% latency, 1.75 GFLOPS)
(linear): Linear(88.5 M = 1.1% Params, 1.42 GMACs = 0.01% MACs, 222.68 us = 0.1% latency, 12.71 TFLOPS, in_features=3840, out_features=23040, bias=True)
(norm): GemmaRMSNorm(3.84 K = 0% Params, 0 MACs = 0% MACs, 432.73 us = 0.19% latency, 0 FLOPS)
)
(self_attn): DiTSelfAttention(
44.24 M = 0.55% Params, 197.3 GMACs = 0.9% MACs, 2.91 ms = 1.27% latency, 135.55 TFLOPS
(q_norm): GemmaRMSNorm(120 = 0% Params, 0 MACs = 0% MACs, 442.98 us = 0.19% latency, 0 FLOPS)
(k_norm): GemmaRMSNorm(120 = 0% Params, 0 MACs = 0% MACs, 227.21 us = 0.1% latency, 0 FLOPS)
(q_proj): Linear(14.75 M = 0.18% Params, 60.4 GMACs = 0.28% MACs, 319.24 us = 0.14% latency, 378.38 TFLOPS, in_features=3840, out_features=3840, bias=False)
(k_proj): Linear(3.69 M = 0.05% Params, 15.1 GMACs = 0.07% MACs, 191.45 us = 0.08% latency, 157.74 TFLOPS, in_features=3840, out_features=960, bias=False)
(v_proj): Linear(3.69 M = 0.05% Params, 15.1 GMACs = 0.07% MACs, 190.26 us = 0.08% latency, 158.73 TFLOPS, in_features=3840, out_features=960, bias=False)
(text_k_proj): Linear(3.69 M = 0.05% Params, 15.1 GMACs = 0.07% MACs, 154.26 us = 0.07% latency, 195.77 TFLOPS, in_features=3840, out_features=960, bias=False)
(text_v_proj): Linear(3.69 M = 0.05% Params, 15.1 GMACs = 0.07% MACs, 154.97 us = 0.07% latency, 194.87 TFLOPS, in_features=3840, out_features=960, bias=False)
(o_proj): Linear(14.75 M = 0.18% Params, 60.4 GMACs = 0.28% MACs, 308.75 us = 0.13% latency, 391.24 TFLOPS, in_features=3840, out_features=3840, bias=False)
)
(post_attention_layernorm): GemmaRMSNorm(3.84 K = 0% Params, 0 MACs = 0% MACs, 431.54 us = 0.19% latency, 0 FLOPS)
(mlp): GemmaMLP(
117.96 M = 1.46% Params, 483.18 GMACs = 2.21% MACs, 2.01 ms = 0.87% latency, 481.46 TFLOPS
(gate_proj): Linear(39.32 M = 0.49% Params, 161.06 GMACs = 0.74% MACs, 588.42 us = 0.26% latency, 547.44 TFLOPS, in_features=3840, out_features=10240, bias=False)
(up_proj): Linear(39.32 M = 0.49% Params, 161.06 GMACs = 0.74% MACs, 587.46 us = 0.26% latency, 548.33 TFLOPS, in_features=3840, out_features=10240, bias=False)
(down_proj): Linear(39.32 M = 0.49% Params, 161.06 GMACs = 0.74% MACs, 541.69 us = 0.24% latency, 594.67 TFLOPS, in_features=10240, out_features=3840, bias=False)
(act_fn): PytorchGELUTanh(0 = 0% Params, 0 MACs = 0% MACs, 94.89 us = 0.04% latency, 442.01 GFLOPS)
)
)
(23): DiTLayer(
250.71 M = 3.1% Params, 681.9 GMACs = 3.12% MACs, 6.88 ms = 2.99% latency, 198.15 TFLOPS
(input_layernorm): AdaLayerNormZero(
88.5 M = 1.1% Params, 1.42 GMACs = 0.01% MACs, 914.34 us = 0.4% latency, 3.1 TFLOPS
(silu): SiLU(0 = 0% Params, 0 MACs = 0% MACs, 37.19 us = 0.02% latency, 1.65 GFLOPS)
(linear): Linear(88.5 M = 1.1% Params, 1.42 GMACs = 0.01% MACs, 228.17 us = 0.1% latency, 12.41 TFLOPS, in_features=3840, out_features=23040, bias=True)
(norm): GemmaRMSNorm(3.84 K = 0% Params, 0 MACs = 0% MACs, 435.35 us = 0.19% latency, 0 FLOPS)
)
(self_attn): DiTSelfAttention(
44.24 M = 0.55% Params, 197.3 GMACs = 0.9% MACs, 2.91 ms = 1.27% latency, 135.51 TFLOPS
(q_norm): GemmaRMSNorm(120 = 0% Params, 0 MACs = 0% MACs, 441.79 us = 0.19% latency, 0 FLOPS)
(k_norm): GemmaRMSNorm(120 = 0% Params, 0 MACs = 0% MACs, 228.88 us = 0.1% latency, 0 FLOPS)
(q_proj): Linear(14.75 M = 0.18% Params, 60.4 GMACs = 0.28% MACs, 320.43 us = 0.14% latency, 376.98 TFLOPS, in_features=3840, out_features=3840, bias=False)
(k_proj): Linear(3.69 M = 0.05% Params, 15.1 GMACs = 0.07% MACs, 193.12 us = 0.08% latency, 156.37 TFLOPS, in_features=3840, out_features=960, bias=False)
(v_proj): Linear(3.69 M = 0.05% Params, 15.1 GMACs = 0.07% MACs, 190.5 us = 0.08% latency, 158.53 TFLOPS, in_features=3840, out_features=960, bias=False)
(text_k_proj): Linear(3.69 M = 0.05% Params, 15.1 GMACs = 0.07% MACs, 152.11 us = 0.07% latency, 198.53 TFLOPS, in_features=3840, out_features=960, bias=False)
(text_v_proj): Linear(3.69 M = 0.05% Params, 15.1 GMACs = 0.07% MACs, 150.2 us = 0.07% latency, 201.05 TFLOPS, in_features=3840, out_features=960, bias=False)
(o_proj): Linear(14.75 M = 0.18% Params, 60.4 GMACs = 0.28% MACs, 311.14 us = 0.14% latency, 388.24 TFLOPS, in_features=3840, out_features=3840, bias=False)
)
(post_attention_layernorm): GemmaRMSNorm(3.84 K = 0% Params, 0 MACs = 0% MACs, 430.11 us = 0.19% latency, 0 FLOPS)
(mlp): GemmaMLP(
117.96 M = 1.46% Params, 483.18 GMACs = 2.21% MACs, 2.01 ms = 0.87% latency, 480.72 TFLOPS
(gate_proj): Linear(39.32 M = 0.49% Params, 161.06 GMACs = 0.74% MACs, 581.5 us = 0.25% latency, 553.95 TFLOPS, in_features=3840, out_features=10240, bias=False)
(up_proj): Linear(39.32 M = 0.49% Params, 161.06 GMACs = 0.74% MACs, 580.07 us = 0.25% latency, 555.31 TFLOPS, in_features=3840, out_features=10240, bias=False)
(down_proj): Linear(39.32 M = 0.49% Params, 161.06 GMACs = 0.74% MACs, 550.51 us = 0.24% latency, 585.14 TFLOPS, in_features=10240, out_features=3840, bias=False)
(act_fn): PytorchGELUTanh(0 = 0% Params, 0 MACs = 0% MACs, 95.84 us = 0.04% latency, 437.62 GFLOPS)
)
)
(24): DiTLayer(
250.71 M = 3.1% Params, 681.9 GMACs = 3.12% MACs, 6.88 ms = 2.99% latency, 198.17 TFLOPS
(input_layernorm): AdaLayerNormZero(
88.5 M = 1.1% Params, 1.42 GMACs = 0.01% MACs, 910.28 us = 0.4% latency, 3.11 TFLOPS
(silu): SiLU(0 = 0% Params, 0 MACs = 0% MACs, 40.29 us = 0.02% latency, 1.52 GFLOPS)
(linear): Linear(88.5 M = 1.1% Params, 1.42 GMACs = 0.01% MACs, 235.08 us = 0.1% latency, 12.04 TFLOPS, in_features=3840, out_features=23040, bias=True)
(norm): GemmaRMSNorm(3.84 K = 0% Params, 0 MACs = 0% MACs, 432.25 us = 0.19% latency, 0 FLOPS)
)
(self_attn): DiTSelfAttention(
44.24 M = 0.55% Params, 197.3 GMACs = 0.9% MACs, 2.91 ms = 1.27% latency, 135.55 TFLOPS
(q_norm): GemmaRMSNorm(120 = 0% Params, 0 MACs = 0% MACs, 443.22 us = 0.19% latency, 0 FLOPS)
(k_norm): GemmaRMSNorm(120 = 0% Params, 0 MACs = 0% MACs, 226.74 us = 0.1% latency, 0 FLOPS)
(q_proj): Linear(14.75 M = 0.18% Params, 60.4 GMACs = 0.28% MACs, 321.63 us = 0.14% latency, 375.58 TFLOPS, in_features=3840, out_features=3840, bias=False)
(k_proj): Linear(3.69 M = 0.05% Params, 15.1 GMACs = 0.07% MACs, 193.36 us = 0.08% latency, 156.18 TFLOPS, in_features=3840, out_features=960, bias=False)
(v_proj): Linear(3.69 M = 0.05% Params, 15.1 GMACs = 0.07% MACs, 190.02 us = 0.08% latency, 158.93 TFLOPS, in_features=3840, out_features=960, bias=False)
(text_k_proj): Linear(3.69 M = 0.05% Params, 15.1 GMACs = 0.07% MACs, 152.35 us = 0.07% latency, 198.22 TFLOPS, in_features=3840, out_features=960, bias=False)
(text_v_proj): Linear(3.69 M = 0.05% Params, 15.1 GMACs = 0.07% MACs, 152.11 us = 0.07% latency, 198.53 TFLOPS, in_features=3840, out_features=960, bias=False)
(o_proj): Linear(14.75 M = 0.18% Params, 60.4 GMACs = 0.28% MACs, 304.94 us = 0.13% latency, 396.13 TFLOPS, in_features=3840, out_features=3840, bias=False)
)
(post_attention_layernorm): GemmaRMSNorm(3.84 K = 0% Params, 0 MACs = 0% MACs, 427.96 us = 0.19% latency, 0 FLOPS)
(mlp): GemmaMLP(
117.96 M = 1.46% Params, 483.18 GMACs = 2.21% MACs, 2.01 ms = 0.87% latency, 480.95 TFLOPS
(gate_proj): Linear(39.32 M = 0.49% Params, 161.06 GMACs = 0.74% MACs, 589.37 us = 0.26% latency, 546.55 TFLOPS, in_features=3840, out_features=10240, bias=False)
(up_proj): Linear(39.32 M = 0.49% Params, 161.06 GMACs = 0.74% MACs, 585.32 us = 0.25% latency, 550.34 TFLOPS, in_features=3840, out_features=10240, bias=False)
(down_proj): Linear(39.32 M = 0.49% Params, 161.06 GMACs = 0.74% MACs, 545.02 us = 0.24% latency, 591.02 TFLOPS, in_features=10240, out_features=3840, bias=False)
(act_fn): PytorchGELUTanh(0 = 0% Params, 0 MACs = 0% MACs, 95.84 us = 0.04% latency, 437.62 GFLOPS)
)
)
(25): DiTLayer(
250.71 M = 3.1% Params, 681.9 GMACs = 3.12% MACs, 6.9 ms = 3% latency, 197.54 TFLOPS
(input_layernorm): AdaLayerNormZero(
88.5 M = 1.1% Params, 1.42 GMACs = 0.01% MACs, 916.96 us = 0.4% latency, 3.09 TFLOPS
(silu): SiLU(0 = 0% Params, 0 MACs = 0% MACs, 38.15 us = 0.02% latency, 1.61 GFLOPS)
(linear): Linear(88.5 M = 1.1% Params, 1.42 GMACs = 0.01% MACs, 240.09 us = 0.1% latency, 11.79 TFLOPS, in_features=3840, out_features=23040, bias=True)
(norm): GemmaRMSNorm(3.84 K = 0% Params, 0 MACs = 0% MACs, 434.64 us = 0.19% latency, 0 FLOPS)
)
(self_attn): DiTSelfAttention(
44.24 M = 0.55% Params, 197.3 GMACs = 0.9% MACs, 2.92 ms = 1.27% latency, 135.19 TFLOPS
(q_norm): GemmaRMSNorm(120 = 0% Params, 0 MACs = 0% MACs, 443.46 us = 0.19% latency, 0 FLOPS)
(k_norm): GemmaRMSNorm(120 = 0% Params, 0 MACs = 0% MACs, 226.02 us = 0.1% latency, 0 FLOPS)
(q_proj): Linear(14.75 M = 0.18% Params, 60.4 GMACs = 0.28% MACs, 329.97 us = 0.14% latency, 366.08 TFLOPS, in_features=3840, out_features=3840, bias=False)
(k_proj): Linear(3.69 M = 0.05% Params, 15.1 GMACs = 0.07% MACs, 194.07 us = 0.08% latency, 155.61 TFLOPS, in_features=3840, out_features=960, bias=False)
(v_proj): Linear(3.69 M = 0.05% Params, 15.1 GMACs = 0.07% MACs, 188.35 us = 0.08% latency, 160.33 TFLOPS, in_features=3840, out_features=960, bias=False)
(text_k_proj): Linear(3.69 M = 0.05% Params, 15.1 GMACs = 0.07% MACs, 153.06 us = 0.07% latency, 197.3 TFLOPS, in_features=3840, out_features=960, bias=False)
(text_v_proj): Linear(3.69 M = 0.05% Params, 15.1 GMACs = 0.07% MACs, 150.2 us = 0.07% latency, 201.05 TFLOPS, in_features=3840, out_features=960, bias=False)
(o_proj): Linear(14.75 M = 0.18% Params, 60.4 GMACs = 0.28% MACs, 307.08 us = 0.13% latency, 393.37 TFLOPS, in_features=3840, out_features=3840, bias=False)
)
(post_attention_layernorm): GemmaRMSNorm(3.84 K = 0% Params, 0 MACs = 0% MACs, 430.82 us = 0.19% latency, 0 FLOPS)
(mlp): GemmaMLP(
117.96 M = 1.46% Params, 483.18 GMACs = 2.21% MACs, 2.02 ms = 0.88% latency, 479.58 TFLOPS
(gate_proj): Linear(39.32 M = 0.49% Params, 161.06 GMACs = 0.74% MACs, 588.89 us = 0.26% latency, 547 TFLOPS, in_features=3840, out_features=10240, bias=False)
(up_proj): Linear(39.32 M = 0.49% Params, 161.06 GMACs = 0.74% MACs, 586.75 us = 0.26% latency, 549 TFLOPS, in_features=3840, out_features=10240, bias=False)
(down_proj): Linear(39.32 M = 0.49% Params, 161.06 GMACs = 0.74% MACs, 545.98 us = 0.24% latency, 589.99 TFLOPS, in_features=10240, out_features=3840, bias=False)
(act_fn): PytorchGELUTanh(0 = 0% Params, 0 MACs = 0% MACs, 96.8 us = 0.04% latency, 433.31 GFLOPS)
)
)
(26): DiTLayer(
250.71 M = 3.1% Params, 681.9 GMACs = 3.12% MACs, 6.86 ms = 2.98% latency, 198.92 TFLOPS
(input_layernorm): AdaLayerNormZero(
88.5 M = 1.1% Params, 1.42 GMACs = 0.01% MACs, 895.26 us = 0.39% latency, 3.16 TFLOPS
(silu): SiLU(0 = 0% Params, 0 MACs = 0% MACs, 37.19 us = 0.02% latency, 1.65 GFLOPS)
(linear): Linear(88.5 M = 1.1% Params, 1.42 GMACs = 0.01% MACs, 224.35 us = 0.1% latency, 12.62 TFLOPS, in_features=3840, out_features=23040, bias=True)
(norm): GemmaRMSNorm(3.84 K = 0% Params, 0 MACs = 0% MACs, 433.92 us = 0.19% latency, 0 FLOPS)
)
(self_attn): DiTSelfAttention(
44.24 M = 0.55% Params, 197.3 GMACs = 0.9% MACs, 2.9 ms = 1.26% latency, 136.05 TFLOPS
(q_norm): GemmaRMSNorm(120 = 0% Params, 0 MACs = 0% MACs, 442.74 us = 0.19% latency, 0 FLOPS)
(k_norm): GemmaRMSNorm(120 = 0% Params, 0 MACs = 0% MACs, 226.02 us = 0.1% latency, 0 FLOPS)
(q_proj): Linear(14.75 M = 0.18% Params, 60.4 GMACs = 0.28% MACs, 319.48 us = 0.14% latency, 378.1 TFLOPS, in_features=3840, out_features=3840, bias=False)
(k_proj): Linear(3.69 M = 0.05% Params, 15.1 GMACs = 0.07% MACs, 191.21 us = 0.08% latency, 157.93 TFLOPS, in_features=3840, out_features=960, bias=False)
(v_proj): Linear(3.69 M = 0.05% Params, 15.1 GMACs = 0.07% MACs, 188.83 us = 0.08% latency, 159.93 TFLOPS, in_features=3840, out_features=960, bias=False)
(text_k_proj): Linear(3.69 M = 0.05% Params, 15.1 GMACs = 0.07% MACs, 152.59 us = 0.07% latency, 197.91 TFLOPS, in_features=3840, out_features=960, bias=False)
(text_v_proj): Linear(3.69 M = 0.05% Params, 15.1 GMACs = 0.07% MACs, 154.73 us = 0.07% latency, 195.17 TFLOPS, in_features=3840, out_features=960, bias=False)
(o_proj): Linear(14.75 M = 0.18% Params, 60.4 GMACs = 0.28% MACs, 304.7 us = 0.13% latency, 396.44 TFLOPS, in_features=3840, out_features=3840, bias=False)
)
(post_attention_layernorm): GemmaRMSNorm(3.84 K = 0% Params, 0 MACs = 0% MACs, 429.87 us = 0.19% latency, 0 FLOPS)
(mlp): GemmaMLP(
117.96 M = 1.46% Params, 483.18 GMACs = 2.21% MACs, 2.02 ms = 0.88% latency, 478.56 TFLOPS
(gate_proj): Linear(39.32 M = 0.49% Params, 161.06 GMACs = 0.74% MACs, 591.28 us = 0.26% latency, 544.79 TFLOPS, in_features=3840, out_features=10240, bias=False)
(up_proj): Linear(39.32 M = 0.49% Params, 161.06 GMACs = 0.74% MACs, 589.13 us = 0.26% latency, 546.77 TFLOPS, in_features=3840, out_features=10240, bias=False)
(down_proj): Linear(39.32 M = 0.49% Params, 161.06 GMACs = 0.74% MACs, 552.18 us = 0.24% latency, 583.37 TFLOPS, in_features=10240, out_features=3840, bias=False)
(act_fn): PytorchGELUTanh(0 = 0% Params, 0 MACs = 0% MACs, 94.41 us = 0.04% latency, 444.25 GFLOPS)
)
)
(27): DiTLayer(
250.71 M = 3.1% Params, 681.9 GMACs = 3.12% MACs, 7.09 ms = 3.08% latency, 192.33 TFLOPS
(input_layernorm): AdaLayerNormZero(
88.5 M = 1.1% Params, 1.42 GMACs = 0.01% MACs, 1.04 ms = 0.45% latency, 2.71 TFLOPS
(silu): SiLU(0 = 0% Params, 0 MACs = 0% MACs, 35.29 us = 0.02% latency, 1.74 GFLOPS)
(linear): Linear(88.5 M = 1.1% Params, 1.42 GMACs = 0.01% MACs, 222.92 us = 0.1% latency, 12.7 TFLOPS, in_features=3840, out_features=23040, bias=True)
(norm): GemmaRMSNorm(3.84 K = 0% Params, 0 MACs = 0% MACs, 578.64 us = 0.25% latency, 0 FLOPS)
)
(self_attn): DiTSelfAttention(
44.24 M = 0.55% Params, 197.3 GMACs = 0.9% MACs, 2.98 ms = 1.29% latency, 132.56 TFLOPS
(q_norm): GemmaRMSNorm(120 = 0% Params, 0 MACs = 0% MACs, 441.55 us = 0.19% latency, 0 FLOPS)
(k_norm): GemmaRMSNorm(120 = 0% Params, 0 MACs = 0% MACs, 229.84 us = 0.1% latency, 0 FLOPS)
(q_proj): Linear(14.75 M = 0.18% Params, 60.4 GMACs = 0.28% MACs, 346.66 us = 0.15% latency, 348.46 TFLOPS, in_features=3840, out_features=3840, bias=False)
(k_proj): Linear(3.69 M = 0.05% Params, 15.1 GMACs = 0.07% MACs, 195.74 us = 0.09% latency, 154.28 TFLOPS, in_features=3840, out_features=960, bias=False)
(v_proj): Linear(3.69 M = 0.05% Params, 15.1 GMACs = 0.07% MACs, 192.17 us = 0.08% latency, 157.15 TFLOPS, in_features=3840, out_features=960, bias=False)
(text_k_proj): Linear(3.69 M = 0.05% Params, 15.1 GMACs = 0.07% MACs, 155.69 us = 0.07% latency, 193.97 TFLOPS, in_features=3840, out_features=960, bias=False)
(text_v_proj): Linear(3.69 M = 0.05% Params, 15.1 GMACs = 0.07% MACs, 151.63 us = 0.07% latency, 199.16 TFLOPS, in_features=3840, out_features=960, bias=False)
(o_proj): Linear(14.75 M = 0.18% Params, 60.4 GMACs = 0.28% MACs, 312.57 us = 0.14% latency, 386.46 TFLOPS, in_features=3840, out_features=3840, bias=False)
)
(post_attention_layernorm): GemmaRMSNorm(3.84 K = 0% Params, 0 MACs = 0% MACs, 430.82 us = 0.19% latency, 0 FLOPS)
(mlp): GemmaMLP(
117.96 M = 1.46% Params, 483.18 GMACs = 2.21% MACs, 2.01 ms = 0.87% latency, 481.69 TFLOPS
(gate_proj): Linear(39.32 M = 0.49% Params, 161.06 GMACs = 0.74% MACs, 587.22 us = 0.26% latency, 548.55 TFLOPS, in_features=3840, out_features=10240, bias=False)
(up_proj): Linear(39.32 M = 0.49% Params, 161.06 GMACs = 0.74% MACs, 583.41 us = 0.25% latency, 552.14 TFLOPS, in_features=3840, out_features=10240, bias=False)
(down_proj): Linear(39.32 M = 0.49% Params, 161.06 GMACs = 0.74% MACs, 540.02 us = 0.23% latency, 596.5 TFLOPS, in_features=10240, out_features=3840, bias=False)
(act_fn): PytorchGELUTanh(0 = 0% Params, 0 MACs = 0% MACs, 98.94 us = 0.04% latency, 423.91 GFLOPS)
)
)
(28): DiTLayer(
250.71 M = 3.1% Params, 681.9 GMACs = 3.12% MACs, 6.84 ms = 2.97% latency, 199.45 TFLOPS
(input_layernorm): AdaLayerNormZero(
88.5 M = 1.1% Params, 1.42 GMACs = 0.01% MACs, 885.01 us = 0.38% latency, 3.2 TFLOPS
(silu): SiLU(0 = 0% Params, 0 MACs = 0% MACs, 34.81 us = 0.02% latency, 1.77 GFLOPS)
(linear): Linear(88.5 M = 1.1% Params, 1.42 GMACs = 0.01% MACs, 219.58 us = 0.1% latency, 12.89 TFLOPS, in_features=3840, out_features=23040, bias=True)
(norm): GemmaRMSNorm(3.84 K = 0% Params, 0 MACs = 0% MACs, 433.92 us = 0.19% latency, 0 FLOPS)
)
(self_attn): DiTSelfAttention(
44.24 M = 0.55% Params, 197.3 GMACs = 0.9% MACs, 2.9 ms = 1.26% latency, 136.12 TFLOPS
(q_norm): GemmaRMSNorm(120 = 0% Params, 0 MACs = 0% MACs, 442.74 us = 0.19% latency, 0 FLOPS)
(k_norm): GemmaRMSNorm(120 = 0% Params, 0 MACs = 0% MACs, 227.45 us = 0.1% latency, 0 FLOPS)
(q_proj): Linear(14.75 M = 0.18% Params, 60.4 GMACs = 0.28% MACs, 319.24 us = 0.14% latency, 378.38 TFLOPS, in_features=3840, out_features=3840, bias=False)
(k_proj): Linear(3.69 M = 0.05% Params, 15.1 GMACs = 0.07% MACs, 190.97 us = 0.08% latency, 158.13 TFLOPS, in_features=3840, out_features=960, bias=False)
(v_proj): Linear(3.69 M = 0.05% Params, 15.1 GMACs = 0.07% MACs, 188.83 us = 0.08% latency, 159.93 TFLOPS, in_features=3840, out_features=960, bias=False)
(text_k_proj): Linear(3.69 M = 0.05% Params, 15.1 GMACs = 0.07% MACs, 151.87 us = 0.07% latency, 198.84 TFLOPS, in_features=3840, out_features=960, bias=False)
(text_v_proj): Linear(3.69 M = 0.05% Params, 15.1 GMACs = 0.07% MACs, 150.2 us = 0.07% latency, 201.05 TFLOPS, in_features=3840, out_features=960, bias=False)
(o_proj): Linear(14.75 M = 0.18% Params, 60.4 GMACs = 0.28% MACs, 306.61 us = 0.13% latency, 393.98 TFLOPS, in_features=3840, out_features=3840, bias=False)
)
(post_attention_layernorm): GemmaRMSNorm(3.84 K = 0% Params, 0 MACs = 0% MACs, 429.39 us = 0.19% latency, 0 FLOPS)
(mlp): GemmaMLP(
117.96 M = 1.46% Params, 483.18 GMACs = 2.21% MACs, 2.01 ms = 0.87% latency, 480.49 TFLOPS
(gate_proj): Linear(39.32 M = 0.49% Params, 161.06 GMACs = 0.74% MACs, 593.19 us = 0.26% latency, 543.04 TFLOPS, in_features=3840, out_features=10240, bias=False)
(up_proj): Linear(39.32 M = 0.49% Params, 161.06 GMACs = 0.74% MACs, 588.42 us = 0.26% latency, 547.44 TFLOPS, in_features=3840, out_features=10240, bias=False)
(down_proj): Linear(39.32 M = 0.49% Params, 161.06 GMACs = 0.74% MACs, 543.83 us = 0.24% latency, 592.32 TFLOPS, in_features=10240, out_features=3840, bias=False)
(act_fn): PytorchGELUTanh(0 = 0% Params, 0 MACs = 0% MACs, 93.7 us = 0.04% latency, 447.64 GFLOPS)
)
)
(29): DiTLayer(
250.71 M = 3.1% Params, 681.9 GMACs = 3.12% MACs, 6.81 ms = 2.96% latency, 200.15 TFLOPS
(input_layernorm): AdaLayerNormZero(
88.5 M = 1.1% Params, 1.42 GMACs = 0.01% MACs, 886.2 us = 0.39% latency, 3.19 TFLOPS
(silu): SiLU(0 = 0% Params, 0 MACs = 0% MACs, 35.29 us = 0.02% latency, 1.74 GFLOPS)
(linear): Linear(88.5 M = 1.1% Params, 1.42 GMACs = 0.01% MACs, 222.44 us = 0.1% latency, 12.73 TFLOPS, in_features=3840, out_features=23040, bias=True)
(norm): GemmaRMSNorm(3.84 K = 0% Params, 0 MACs = 0% MACs, 432.73 us = 0.19% latency, 0 FLOPS)
)
(self_attn): DiTSelfAttention(
44.24 M = 0.55% Params, 197.3 GMACs = 0.9% MACs, 2.89 ms = 1.26% latency, 136.52 TFLOPS
(q_norm): GemmaRMSNorm(120 = 0% Params, 0 MACs = 0% MACs, 441.31 us = 0.19% latency, 0 FLOPS)
(k_norm): GemmaRMSNorm(120 = 0% Params, 0 MACs = 0% MACs, 226.97 us = 0.1% latency, 0 FLOPS)
(q_proj): Linear(14.75 M = 0.18% Params, 60.4 GMACs = 0.28% MACs, 318.77 us = 0.14% latency, 378.95 TFLOPS, in_features=3840, out_features=3840, bias=False)
(k_proj): Linear(3.69 M = 0.05% Params, 15.1 GMACs = 0.07% MACs, 191.21 us = 0.08% latency, 157.93 TFLOPS, in_features=3840, out_features=960, bias=False)
(v_proj): Linear(3.69 M = 0.05% Params, 15.1 GMACs = 0.07% MACs, 188.83 us = 0.08% latency, 159.93 TFLOPS, in_features=3840, out_features=960, bias=False)
(text_k_proj): Linear(3.69 M = 0.05% Params, 15.1 GMACs = 0.07% MACs, 150.68 us = 0.07% latency, 200.42 TFLOPS, in_features=3840, out_features=960, bias=False)
(text_v_proj): Linear(3.69 M = 0.05% Params, 15.1 GMACs = 0.07% MACs, 149.73 us = 0.07% latency, 201.69 TFLOPS, in_features=3840, out_features=960, bias=False)
(o_proj): Linear(14.75 M = 0.18% Params, 60.4 GMACs = 0.28% MACs, 304.7 us = 0.13% latency, 396.44 TFLOPS, in_features=3840, out_features=3840, bias=False)
)
(post_attention_layernorm): GemmaRMSNorm(3.84 K = 0% Params, 0 MACs = 0% MACs, 428.2 us = 0.19% latency, 0 FLOPS)
(mlp): GemmaMLP(
117.96 M = 1.46% Params, 483.18 GMACs = 2.21% MACs, 2 ms = 0.87% latency, 484.11 TFLOPS
(gate_proj): Linear(39.32 M = 0.49% Params, 161.06 GMACs = 0.74% MACs, 586.99 us = 0.26% latency, 548.77 TFLOPS, in_features=3840, out_features=10240, bias=False)
(up_proj): Linear(39.32 M = 0.49% Params, 161.06 GMACs = 0.74% MACs, 584.6 us = 0.25% latency, 551.01 TFLOPS, in_features=3840, out_features=10240, bias=False)
(down_proj): Linear(39.32 M = 0.49% Params, 161.06 GMACs = 0.74% MACs, 540.49 us = 0.24% latency, 595.98 TFLOPS, in_features=10240, out_features=3840, bias=False)
(act_fn): PytorchGELUTanh(0 = 0% Params, 0 MACs = 0% MACs, 94.41 us = 0.04% latency, 444.25 GFLOPS)
)
)
(30): DiTLayer(
250.71 M = 3.1% Params, 681.9 GMACs = 3.12% MACs, 6.83 ms = 2.97% latency, 199.78 TFLOPS
(input_layernorm): AdaLayerNormZero(
88.5 M = 1.1% Params, 1.42 GMACs = 0.01% MACs, 884.06 us = 0.38% latency, 3.2 TFLOPS
(silu): SiLU(0 = 0% Params, 0 MACs = 0% MACs, 34.33 us = 0.01% latency, 1.79 GFLOPS)
(linear): Linear(88.5 M = 1.1% Params, 1.42 GMACs = 0.01% MACs, 221.25 us = 0.1% latency, 12.8 TFLOPS, in_features=3840, out_features=23040, bias=True)
(norm): GemmaRMSNorm(3.84 K = 0% Params, 0 MACs = 0% MACs, 433.68 us = 0.19% latency, 0 FLOPS)
)
(self_attn): DiTSelfAttention(
44.24 M = 0.55% Params, 197.3 GMACs = 0.9% MACs, 2.89 ms = 1.26% latency, 136.51 TFLOPS
(q_norm): GemmaRMSNorm(120 = 0% Params, 0 MACs = 0% MACs, 441.79 us = 0.19% latency, 0 FLOPS)
(k_norm): GemmaRMSNorm(120 = 0% Params, 0 MACs = 0% MACs, 225.54 us = 0.1% latency, 0 FLOPS)
(q_proj): Linear(14.75 M = 0.18% Params, 60.4 GMACs = 0.28% MACs, 319 us = 0.14% latency, 378.67 TFLOPS, in_features=3840, out_features=3840, bias=False)
(k_proj): Linear(3.69 M = 0.05% Params, 15.1 GMACs = 0.07% MACs, 190.5 us = 0.08% latency, 158.53 TFLOPS, in_features=3840, out_features=960, bias=False)
(v_proj): Linear(3.69 M = 0.05% Params, 15.1 GMACs = 0.07% MACs, 189.54 us = 0.08% latency, 159.33 TFLOPS, in_features=3840, out_features=960, bias=False)
(text_k_proj): Linear(3.69 M = 0.05% Params, 15.1 GMACs = 0.07% MACs, 153.3 us = 0.07% latency, 196.99 TFLOPS, in_features=3840, out_features=960, bias=False)
(text_v_proj): Linear(3.69 M = 0.05% Params, 15.1 GMACs = 0.07% MACs, 151.4 us = 0.07% latency, 199.47 TFLOPS, in_features=3840, out_features=960, bias=False)
(o_proj): Linear(14.75 M = 0.18% Params, 60.4 GMACs = 0.28% MACs, 303.03 us = 0.13% latency, 398.63 TFLOPS, in_features=3840, out_features=3840, bias=False)
)
(post_attention_layernorm): GemmaRMSNorm(3.84 K = 0% Params, 0 MACs = 0% MACs, 429.15 us = 0.19% latency, 0 FLOPS)
(mlp): GemmaMLP(
117.96 M = 1.46% Params, 483.18 GMACs = 2.21% MACs, 2.01 ms = 0.88% latency, 480.38 TFLOPS
(gate_proj): Linear(39.32 M = 0.49% Params, 161.06 GMACs = 0.74% MACs, 590.32 us = 0.26% latency, 545.67 TFLOPS, in_features=3840, out_features=10240, bias=False)
(up_proj): Linear(39.32 M = 0.49% Params, 161.06 GMACs = 0.74% MACs, 588.18 us = 0.26% latency, 547.66 TFLOPS, in_features=3840, out_features=10240, bias=False)
(down_proj): Linear(39.32 M = 0.49% Params, 161.06 GMACs = 0.74% MACs, 545.5 us = 0.24% latency, 590.51 TFLOPS, in_features=10240, out_features=3840, bias=False)
(act_fn): PytorchGELUTanh(0 = 0% Params, 0 MACs = 0% MACs, 94.65 us = 0.04% latency, 443.13 GFLOPS)
)
)
(31): DiTLayer(
250.71 M = 3.1% Params, 681.9 GMACs = 3.12% MACs, 6.82 ms = 2.97% latency, 199.9 TFLOPS
(input_layernorm): AdaLayerNormZero(
88.5 M = 1.1% Params, 1.42 GMACs = 0.01% MACs, 895.5 us = 0.39% latency, 3.16 TFLOPS
(silu): SiLU(0 = 0% Params, 0 MACs = 0% MACs, 36.48 us = 0.02% latency, 1.68 GFLOPS)
(linear): Linear(88.5 M = 1.1% Params, 1.42 GMACs = 0.01% MACs, 226.26 us = 0.1% latency, 12.51 TFLOPS, in_features=3840, out_features=23040, bias=True)
(norm): GemmaRMSNorm(3.84 K = 0% Params, 0 MACs = 0% MACs, 434.16 us = 0.19% latency, 0 FLOPS)
)
(self_attn): DiTSelfAttention(
44.24 M = 0.55% Params, 197.3 GMACs = 0.9% MACs, 2.89 ms = 1.26% latency, 136.31 TFLOPS
(q_norm): GemmaRMSNorm(120 = 0% Params, 0 MACs = 0% MACs, 440.84 us = 0.19% latency, 0 FLOPS)
(k_norm): GemmaRMSNorm(120 = 0% Params, 0 MACs = 0% MACs, 233.41 us = 0.1% latency, 0 FLOPS)
(q_proj): Linear(14.75 M = 0.18% Params, 60.4 GMACs = 0.28% MACs, 319.96 us = 0.14% latency, 377.54 TFLOPS, in_features=3840, out_features=3840, bias=False)
(k_proj): Linear(3.69 M = 0.05% Params, 15.1 GMACs = 0.07% MACs, 190.97 us = 0.08% latency, 158.13 TFLOPS, in_features=3840, out_features=960, bias=False)
(v_proj): Linear(3.69 M = 0.05% Params, 15.1 GMACs = 0.07% MACs, 189.3 us = 0.08% latency, 159.53 TFLOPS, in_features=3840, out_features=960, bias=False)
(text_k_proj): Linear(3.69 M = 0.05% Params, 15.1 GMACs = 0.07% MACs, 152.11 us = 0.07% latency, 198.53 TFLOPS, in_features=3840, out_features=960, bias=False)
(text_v_proj): Linear(3.69 M = 0.05% Params, 15.1 GMACs = 0.07% MACs, 150.2 us = 0.07% latency, 201.05 TFLOPS, in_features=3840, out_features=960, bias=False)
(o_proj): Linear(14.75 M = 0.18% Params, 60.4 GMACs = 0.28% MACs, 303.03 us = 0.13% latency, 398.63 TFLOPS, in_features=3840, out_features=3840, bias=False)
)
(post_attention_layernorm): GemmaRMSNorm(3.84 K = 0% Params, 0 MACs = 0% MACs, 429.63 us = 0.19% latency, 0 FLOPS)
(mlp): GemmaMLP(
117.96 M = 1.46% Params, 483.18 GMACs = 2.21% MACs, 1.99 ms = 0.87% latency, 485.85 TFLOPS
(gate_proj): Linear(39.32 M = 0.49% Params, 161.06 GMACs = 0.74% MACs, 583.65 us = 0.25% latency, 551.91 TFLOPS, in_features=3840, out_features=10240, bias=False)
(up_proj): Linear(39.32 M = 0.49% Params, 161.06 GMACs = 0.74% MACs, 581.98 us = 0.25% latency, 553.49 TFLOPS, in_features=3840, out_features=10240, bias=False)
(down_proj): Linear(39.32 M = 0.49% Params, 161.06 GMACs = 0.74% MACs, 538.59 us = 0.23% latency, 598.09 TFLOPS, in_features=10240, out_features=3840, bias=False)
(act_fn): PytorchGELUTanh(0 = 0% Params, 0 MACs = 0% MACs, 94.65 us = 0.04% latency, 443.13 GFLOPS)
)
)
)
(patch_embed): PatchEmbed(
249.6 K = 0% Params, 1.01 GMACs = 0% MACs, 595.33 us = 0.26% latency, 3.41 TFLOPS
(proj): Conv2d(249.6 K = 0% Params, 1.01 GMACs = 0% MACs, 367.64 us = 0.16% latency, 5.52 TFLOPS, 16, 3840, kernel_size=(2, 2), stride=(2, 2))
)
(rotary_emb): GemmaRotaryEmbedding(0 = 0% Params, 0 MACs = 0% MACs, 0 s = 0% latency, 0 FLOPS)
(time_proj): Timesteps(0 = 0% Params, 0 MACs = 0% MACs, 239.13 us = 0.1% latency, 0 FLOPS)
(timestep_embedder): Sequential(
15.74 M = 0.19% Params, 251.66 MMACs = 0% MACs, 510.93 us = 0.22% latency, 985.22 GFLOPS
(0): Linear(986.88 K = 0.01% Params, 15.73 MMACs = 0% MACs, 216.96 us = 0.09% latency, 144.99 GFLOPS, in_features=256, out_features=3840, bias=True)
(1): SiLU(0 = 0% Params, 0 MACs = 0% MACs, 44.82 us = 0.02% latency, 1.37 GFLOPS)
(2): Linear(14.75 M = 0.18% Params, 235.93 MMACs = 0% MACs, 177.38 us = 0.08% latency, 2.66 TFLOPS, in_features=3840, out_features=3840, bias=True)
)
(context_embedder): Sequential(
7.87 M = 0.1% Params, 32.21 GMACs = 0.15% MACs, 479.94 us = 0.21% latency, 134.24 TFLOPS
(0): GemmaRMSNorm(2.05 K = 0% Params, 0 MACs = 0% MACs, 177.38 us = 0.08% latency, 0 FLOPS)
(1): Linear(7.87 M = 0.1% Params, 32.21 GMACs = 0.15% MACs, 250.82 us = 0.11% latency, 256.86 TFLOPS, in_features=2048, out_features=3840, bias=True)
)
(norm_out): AdaLayerNormOut(
29.5 M = 0.37% Params, 471.86 MMACs = 0% MACs, 836.61 us = 0.36% latency, 1.13 TFLOPS
(silu): SiLU(0 = 0% Params, 0 MACs = 0% MACs, 36.72 us = 0.02% latency, 1.67 GFLOPS)
(linear): Linear(29.5 M = 0.37% Params, 471.86 MMACs = 0% MACs, 166.89 us = 0.07% latency, 5.65 TFLOPS, in_features=3840, out_features=7680, bias=True)
(norm): GemmaRMSNorm(3.84 K = 0% Params, 0 MACs = 0% MACs, 430.82 us = 0.19% latency, 0 FLOPS)
)
(proj_out): Linear(245.82 K = 0% Params, 1.01 GMACs = 0% MACs, 174.05 us = 0.08% latency, 11.57 TFLOPS, in_features=3840, out_features=64, bias=True)
)
------------------------------------------------------------------------------
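
Reading the detailed entries above: for the Linear projections, the reported throughput is consistent with flops = 2 x MACs divided by the measured forward latency (activation and normalization modules report flops that are not MAC-based, which is why SiLU and PytorchGELUTanh show 0 MACs but non-zero GFLOPS, and GemmaRMSNorm shows 0 FLOPS). The short Python sketch below is illustrative only and is not part of the profiler output; the helper name achieved_tflops is ours, and the two sample values are copied from the layer (31) q_proj and gate_proj lines above.

# Illustrative check (not part of the profiler output): convert a submodule's
# MAC count to flops (2 flops per multiply-accumulate) and divide by its
# measured forward latency to recover the TFLOPS figure printed in the profile.

def achieved_tflops(macs: float, latency_s: float) -> float:
    """Achieved throughput in TFLOPS, assuming 2 flops per MAC."""
    return 2 * macs / latency_s / 1e12

# (q_proj) of layer (31): 60.4 GMACs in 319.96 us -> ~377.5 TFLOPS (profile: 377.54)
print(f"q_proj:    {achieved_tflops(60.4e9, 319.96e-6):.2f} TFLOPS")

# (gate_proj) of layer (31): 161.06 GMACs in 583.65 us -> ~551.9 TFLOPS (profile: 551.91)
print(f"gate_proj: {achieved_tflops(161.06e9, 583.65e-6):.2f} TFLOPS")

A comparable report is normally produced by the DeepSpeed flops profiler itself, typically enabled through the flops_profiler block of the training config (keys such as enabled, profile_step, top_modules, and detailed); the exact configuration used for this run is not included in this dump.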