xingjianleng committed 27 days ago

Commit

41446d2

verified ·

1 Parent(s): 2ffca04

Add files using upload-large-folder tool

Browse files

Files changed (50) hide show

100000/bf16_zero_pp_rank_11_mp_rank_00_optim_states.pt +3 -0
100000/bf16_zero_pp_rank_12_mp_rank_00_optim_states.pt +3 -0
100000/bf16_zero_pp_rank_13_mp_rank_00_optim_states.pt +3 -0
100000/bf16_zero_pp_rank_14_mp_rank_00_optim_states.pt +3 -0
100000/bf16_zero_pp_rank_15_mp_rank_00_optim_states.pt +3 -0
100000/bf16_zero_pp_rank_18_mp_rank_00_optim_states.pt +3 -0
100000/bf16_zero_pp_rank_21_mp_rank_00_optim_states.pt +3 -0
100000/bf16_zero_pp_rank_26_mp_rank_00_optim_states.pt +3 -0
100000/bf16_zero_pp_rank_28_mp_rank_00_optim_states.pt +3 -0
100000/bf16_zero_pp_rank_2_mp_rank_00_optim_states.pt +3 -0
100000/bf16_zero_pp_rank_30_mp_rank_00_optim_states.pt +3 -0
100000/bf16_zero_pp_rank_31_mp_rank_00_optim_states.pt +3 -0
100000/bf16_zero_pp_rank_5_mp_rank_00_optim_states.pt +3 -0
100000/bf16_zero_pp_rank_8_mp_rank_00_optim_states.pt +3 -0
100000/rng-0.ckpt +3 -0
100000/rng-1.ckpt +3 -0
100000/rng-10.ckpt +3 -0
100000/rng-11.ckpt +3 -0
100000/rng-12.ckpt +3 -0
100000/rng-13.ckpt +3 -0
100000/rng-14.ckpt +3 -0
100000/rng-15.ckpt +3 -0
100000/rng-16.ckpt +3 -0
100000/rng-17.ckpt +3 -0
100000/rng-18.ckpt +3 -0
100000/rng-19.ckpt +3 -0
100000/rng-2.ckpt +3 -0
100000/rng-20.ckpt +3 -0
100000/rng-21.ckpt +3 -0
100000/rng-22.ckpt +3 -0
100000/rng-23.ckpt +3 -0
100000/rng-24.ckpt +3 -0
100000/rng-25.ckpt +3 -0
100000/rng-26.ckpt +3 -0
100000/rng-27.ckpt +3 -0
100000/rng-28.ckpt +3 -0
100000/rng-29.ckpt +3 -0
100000/rng-3.ckpt +3 -0
100000/rng-30.ckpt +3 -0
100000/rng-31.ckpt +3 -0
100000/rng-4.ckpt +3 -0
100000/rng-5.ckpt +3 -0
100000/rng-6.ckpt +3 -0
100000/rng-7.ckpt +3 -0
100000/rng-8.ckpt +3 -0
100000/rng-9.ckpt +3 -0
config.json +107 -0
flops.txt +990 -0
latest +1 -0
zero_to_fp32.py +604 -0

100000/bf16_zero_pp_rank_11_mp_rank_00_optim_states.pt ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:632d0072de3cb67b1ae2516fede170e15e773045f7d457bee266a1515d9d32d3
+size 3452851031

100000/bf16_zero_pp_rank_12_mp_rank_00_optim_states.pt ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:47a25143d26054b6f3e468abbd1802e66fedd3e3f47516d3f972a08022b2adb9
+size 3452851031

100000/bf16_zero_pp_rank_13_mp_rank_00_optim_states.pt ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:fbfc55fd70be306fc0b1573f29cfa9364d16750d5fde79c5a87fabfb9d775242
+size 3452851031

100000/bf16_zero_pp_rank_14_mp_rank_00_optim_states.pt ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:2d862d07fcceaf5fcffc931dfb84a11900a39cc8a63e1174f3846dee5f5490de
+size 3452851031

100000/bf16_zero_pp_rank_15_mp_rank_00_optim_states.pt ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:61f6e625882f114b5f8db78eadb49dbb96d70a0e7b59c9ae6b2ec2f14bdf516c
+size 3452851031

100000/bf16_zero_pp_rank_18_mp_rank_00_optim_states.pt ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:15fe9f91dc03b2996d368f72ef7ba4a2623df6739ecf00816c12303587d77cc4
+size 3452851031

100000/bf16_zero_pp_rank_21_mp_rank_00_optim_states.pt ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:989f6175aaae17f7ddc85aa9c09013103b7adc0a2141de425c7d66fcfe4ab724
+size 3452851031

100000/bf16_zero_pp_rank_26_mp_rank_00_optim_states.pt ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:75556779acf6cef99ddf7af465add5890ed5a1088257e70842d7d1446e8cb1e2
+size 3452851031

100000/bf16_zero_pp_rank_28_mp_rank_00_optim_states.pt ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:0e9975ab00d0131d94c207af559ea8a066005748d55813470229e1a178793773
+size 3452851031

100000/bf16_zero_pp_rank_2_mp_rank_00_optim_states.pt ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:d068e9c6d43ad6e3f6438a64d0035f13bb32ebe92805b180b3ecc59ad3408bee
+size 3452850960

100000/bf16_zero_pp_rank_30_mp_rank_00_optim_states.pt ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:0d57042c75add8b19e506623d786252da5b4874d6799df3b48cb00a1b1701d85
+size 3452851031

100000/bf16_zero_pp_rank_31_mp_rank_00_optim_states.pt ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:d32c14cde2f2a38f3afb97efc3b25ab366dc04fa9adf78b8e6ba0896b536cbaf
+size 3452851991

100000/bf16_zero_pp_rank_5_mp_rank_00_optim_states.pt ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:1360e51e97d95f16669a2e8811ef6ed12d7b5e65c212de6d2a2cb54952141671
+size 3452850960

100000/bf16_zero_pp_rank_8_mp_rank_00_optim_states.pt ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:59dc05b998d7a0636bea85648e0ecdc47db54f2a645de29a6ef5286d22eca157
+size 3452850960

100000/rng-0.ckpt ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:8bb5b4727faf939c2791701613158408e130f58cad1382c138761ad169e98acc
+size 14906

100000/rng-1.ckpt ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:916850147c8cb799a58cb714b2a6dbc74f80316bd9e06dfc8c404bc62a2d01a7
+size 14906

100000/rng-10.ckpt ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:64ef9a5f41060dcc0bc5811fd3d3dd895023ae5bfc7889cff5ead763623251ab
+size 14915

100000/rng-11.ckpt ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:21e4cb5e622d4cf9a7210cf825526406ebbebf95982a2e897fbc4e2c35eee52e
+size 14915

100000/rng-12.ckpt ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:212b3009effa283891fa77dd98dabc78d0a30e536035ba686c49d558ab7fb809
+size 14915

100000/rng-13.ckpt ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:cf5cc6a4c24f492e903357192866578a7727b3ef122829f6b087382e56b31219
+size 14915

100000/rng-14.ckpt ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:19071d6571957f5994add0eae66ca0adba5c0b914c74969f95c4f650f382d9c7
+size 14915

100000/rng-15.ckpt ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:94b688b2ab22e5022af4fcc4bbe20ca2296906de249b108a06abe09feed0a84a
+size 14915

100000/rng-16.ckpt ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:f2444d3346cdc9e2e17a33c4fbbba8acc61d926542e2770c164a071a58c91ea0
+size 14915

100000/rng-17.ckpt ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:a94bce3f38427488db4125505c2ddbba197d4fb71877f0ef108beb397c96fdd5
+size 14915

100000/rng-18.ckpt ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:1e2b823888affc27a052073dd61ca8763970b080079b8d5ab0cc60419f653a8b
+size 14915

100000/rng-19.ckpt ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:978ad2f51b5e37a9647da00f4d8b2f4a56a167f9258baa6c44f062e741b2fe89
+size 14915

100000/rng-2.ckpt ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:4b570083d8aab5cc274e4eeebd285f9ea4ac9f47bcba008c1cd9d0fc66bb318b
+size 14906

100000/rng-20.ckpt ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:885394dbcd4f27920d425576cc32401e15507cfa31259c496ea57645e1ebe9d9
+size 14915

100000/rng-21.ckpt ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:bb0e47e1ae16b85cd4de5e20381a37706a7e895a0b506a515e9cfa3eb1c55b5f
+size 14915

100000/rng-22.ckpt ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:3abcc8137c26e19828a8389381972250fbbd119dc646785a06c8f59c2ba3b4fd
+size 14915

100000/rng-23.ckpt ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:7d471922bc8207f2cac22549e282e7104b99b80ac0c5607d8693a719861b4dd4
+size 14915

100000/rng-24.ckpt ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:e2e2ac11942ecdf990c54bcd488b3e6a07624a39f091d7e4e5ba4defa44be880
+size 14915

100000/rng-25.ckpt ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:82ec4bd84d0764b0062c84c7ba04dd9307fb76a026b755952379f696f61c2aea
+size 14915

100000/rng-26.ckpt ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:3c488b8418ed9511d6a9a6696323b8e1a92fe55635ec99e3f75b4fd6139d56df
+size 14915

100000/rng-27.ckpt ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:b417db8a88bfebdd0f40af3b2b098cee8658f313b68d41d4b17cec90a64b2572
+size 14915

100000/rng-28.ckpt ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:16178c33b8bfed85576174321d0d8f4f15f48514295c58f00e9cc35d1a43ae01
+size 14915

100000/rng-29.ckpt ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:2a35c2efce55dbee5a072f6913b42d2c9d680482b359b1d8d3985aa8935dac4d
+size 14915

100000/rng-3.ckpt ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:35d092f725aed26e952cf049fe53f7e28e094362a966ed1a92659798bd37f7db
+size 14906

100000/rng-30.ckpt ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:ec623f54939453ff59cf1c6a3094bd9cc926e3b69decefc0169f508079d01051
+size 14915

100000/rng-31.ckpt ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:402a431a8b78c557bf3169570493ad78d0bae37ce0c4eab1e9d7bbafcba06c92
+size 14915

100000/rng-4.ckpt ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:236ddb166fbade54d5609adeccc6f57ab75b8acc3d04da10c4c0ee964942820e
+size 14906

100000/rng-5.ckpt ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:054e766c0d083d3f1af7d4fdd8cffc3a4e365cf8ff23da3ea761584b112a48f2
+size 14906

100000/rng-6.ckpt ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:2ab96b7ee0eef3768d5bc14e3f8ba28861f67aa30be5b301354c7bfaac049dd2
+size 14906

100000/rng-7.ckpt ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:961901a7e00841a38e8d397016a1457d77a47576caed01cd9d3ab4b2cfbf663a
+size 14906

100000/rng-8.ckpt ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:5fdf07a858c1592b7fad503b7a80964e94ea15194a27dfeffee76a5a512dd7d5
+size 14906

100000/rng-9.ckpt ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:95cd4916247cb62f865b8205343c53ce65b86e7593db000d70e4e961e08fb86e
+size 14906

config.json ADDED Viewed

	@@ -0,0 +1,107 @@

+{
+  "attention": "self",
+  "base_config": {
+    "_name_or_path": "google/gemma-2b",
+    "add_cross_attention": false,
+    "architectures": [
+      "GemmaForCausalLM"
+    ],
+    "attention_bias": false,
+    "attention_dropout": 0.0,
+    "bad_words_ids": null,
+    "begin_suppress_tokens": null,
+    "bos_token_id": 2,
+    "chunk_size_feed_forward": 0,
+    "cross_attention_hidden_size": null,
+    "decoder_start_token_id": null,
+    "diversity_penalty": 0.0,
+    "do_sample": false,
+    "early_stopping": false,
+    "encoder_no_repeat_ngram_size": 0,
+    "eos_token_id": 1,
+    "exponential_decay_length_penalty": null,
+    "finetuning_task": null,
+    "forced_bos_token_id": null,
+    "forced_eos_token_id": null,
+    "head_dim": 128,
+    "hidden_act": "gelu",
+    "hidden_activation": "gelu_pytorch_tanh",
+    "hidden_size": 4096,
+    "id2label": {
+      "0": "LABEL_0",
+      "1": "LABEL_1"
+    },
+    "initializer_range": 0.02,
+    "intermediate_size": 10936,
+    "is_decoder": false,
+    "is_encoder_decoder": false,
+    "label2id": {
+      "LABEL_0": 0,
+      "LABEL_1": 1
+    },
+    "length_penalty": 1.0,
+    "max_length": 20,
+    "max_position_embeddings": 8192,
+    "min_length": 0,
+    "model_type": "gemma",
+    "no_repeat_ngram_size": 0,
+    "num_attention_heads": 32,
+    "num_beam_groups": 1,
+    "num_beams": 1,
+    "num_hidden_layers": 18,
+    "num_key_value_heads": 8,
+    "num_return_sequences": 1,
+    "output_attentions": false,
+    "output_hidden_states": false,
+    "output_scores": false,
+    "pad_token_id": 0,
+    "prefix": null,
+    "problem_type": null,
+    "pruned_heads": {},
+    "remove_invalid_values": false,
+    "repetition_penalty": 1.0,
+    "return_dict": true,
+    "return_dict_in_generate": false,
+    "rms_norm_eps": 1e-06,
+    "rope_scaling": null,
+    "rope_theta": 10000.0,
+    "sep_token_id": null,
+    "suppress_tokens": null,
+    "task_specific_params": null,
+    "temperature": 1.0,
+    "tf_legacy_loss": false,
+    "tie_encoder_decoder": false,
+    "tie_word_embeddings": true,
+    "tokenizer_class": null,
+    "top_k": 50,
+    "top_p": 1.0,
+    "torch_dtype": "bfloat16",
+    "torchscript": false,
+    "typical_p": 1.0,
+    "use_bfloat16": false,
+    "use_cache": true,
+    "vocab_size": 256000
+  },
+  "dit_hidden_size": 4096,
+  "dit_num_hidden_layers": 32,
+  "in_channels": 16,
+  "initial_layers": 0,
+  "model_type": "DiT",
+  "out_channels": 16,
+  "patch_size": 2,
+  "pos_embed": "ape",
+  "pos_embed_max_size": 64,
+  "qk_norm": true,
+  "repa_enable": true,
+  "repa_enc_depth": 8,
+  "repa_projector_dim": 2048,
+  "repa_z_dim": 768,
+  "sample_size": 32,
+  "sandwich_norm": false,
+  "shared_attention_layers": "all",
+  "text_hidden_size": 2048,
+  "text_hidden_states_index": -1,
+  "text_modulation_embeds_dim": null,
+  "timestep_conditioning": "adaln-zero",
+  "transformers_version": "4.43.3"
+}

flops.txt ADDED Viewed

	@@ -0,0 +1,990 @@

+-------------------------- DeepSpeed Flops Profiler --------------------------
+Profile Summary at step 2:
+Notations:
+data parallel size (dp_size), model parallel size(mp_size),
+number of parameters (params), number of multiply-accumulate operations(MACs),
+number of floating-point operations (flops), floating-point operations per second (FLOPS),
+fwd latency (forward propagation latency), bwd latency (backward propagation latency),
+step (weights update latency), iter latency (sum of fwd, bwd and step latency)
+world size:                                                             32
+data parallel size:                                                     32
+model parallel size:                                                    1
+batch size per GPU:                                                     16
+params per GPU:                                                         9.21 B
+params of model = params per GPU * mp_size:                             9.21 B
+fwd MACs per GPU:                                                       24.91 TMACs
+fwd flops per GPU:                                                      49.82 T
+fwd flops of model = fwd flops per GPU * mp_size:                       49.82 T
+fwd latency:                                                            246.3 ms
+fwd FLOPS per GPU = fwd flops per GPU / fwd latency:                    202.25 TFLOPS
+bwd latency:                                                            973.32 ms
+bwd FLOPS per GPU = 2 * fwd flops per GPU / bwd latency:                102.36 TFLOPS
+fwd+bwd FLOPS per GPU = 3 * fwd flops per GPU / (fwd+bwd latency):      122.54 TFLOPS
+step latency:                                                           441.01 ms
+iter latency:                                                           1.66 s
+FLOPS per GPU = 3 * fwd flops per GPU / iter latency:                   89.99 TFLOPS
+samples/second:                                                         308.32
+----------------------------- Aggregated Profile per GPU -----------------------------
+Top 1 modules in terms of params, MACs or fwd latency at different model depths:
+depth 0:
+    params      - {'DiT': '9.21 B'}
+    MACs        - {'DiT': '24.91 TMACs'}
+    fwd latency - {'DiT': '246.07 ms'}
+depth 1:
+    params      - {'ModuleList': '9.13 B'}
+    MACs        - {'ModuleList': '24.81 TMACs'}
+    fwd latency - {'ModuleList': '234.7 ms'}
+depth 2:
+    params      - {'DiTLayer': '9.13 B'}
+    MACs        - {'DiTLayer': '24.81 TMACs'}
+    fwd latency - {'DiTLayer': '234.7 ms'}
+depth 3:
+    params      - {'GemmaMLP': '4.3 B'}
+    MACs        - {'GemmaMLP': '17.61 TMACs'}
+    fwd latency - {'DiTSelfAttention': '96.91 ms'}
+------------------------------ Detailed Profile per GPU ------------------------------
+Each module profile is listed after its name in the following order:
+params, percentage of total params, MACs, percentage of total MACs, fwd latency, percentage of total fwd latency, fwd FLOPS
+Note: 1. A module can have torch.nn.module or torch.nn.functional to compute logits (e.g. CrossEntropyLoss). They are not counted as submodules, thus not to be printed out. However they make up the difference between a parent's MACs (or latency) and the sum of its submodules'.
+2. Number of floating-point operations is a theoretical estimation, thus FLOPS computed using that could be larger than the maximum system throughput.
+3. The fwd latency listed in the top module's profile is directly captured at the module forward function in PyTorch, thus it's less than the fwd latency shown above which is captured in DeepSpeed.
+DiT(
+  9.21 B = 100% Params, 24.91 TMACs = 100% MACs, 246.07 ms = 100% latency, 202.45 TFLOPS
+  (layers): ModuleList(
+    (0): DiTLayer(
+      285.41 M = 3.1% Params, 775.38 GMACs = 3.11% MACs, 7.34 ms = 2.98% latency, 211.41 TFLOPS
+      (input_layernorm): AdaLayerNormZero(
+        100.69 M = 1.09% Params, 1.61 GMACs = 0.01% MACs, 991.11 us = 0.4% latency, 3.25 TFLOPS
+        (silu): SiLU(0 = 0% Params, 0 MACs = 0% MACs, 39.82 us = 0.02% latency, 1.65 GFLOPS)
+        (linear): Linear(100.69 M = 1.09% Params, 1.61 GMACs = 0.01% MACs, 270.13 us = 0.11% latency, 11.92 TFLOPS, in_features=4096, out_features=24576, bias=True)
+        (norm): GemmaRMSNorm(4.1 K = 0% Params, 0 MACs = 0% MACs, 462.53 us = 0.19% latency, 0 FLOPS)
+      )
+      (self_attn): DiTSelfAttention(
+        50.33 M = 0.55% Params, 223.34 GMACs = 0.9% MACs, 3.05 ms = 1.24% latency, 146.31 TFLOPS
+        (q_norm): GemmaRMSNorm(128 = 0% Params, 0 MACs = 0% MACs, 463.96 us = 0.19% latency, 0 FLOPS)
+        (k_norm): GemmaRMSNorm(128 = 0% Params, 0 MACs = 0% MACs, 236.27 us = 0.1% latency, 0 FLOPS)
+        (q_proj): Linear(16.78 M = 0.18% Params, 68.72 GMACs = 0.28% MACs, 361.44 us = 0.15% latency, 380.25 TFLOPS, in_features=4096, out_features=4096, bias=False)
+        (k_proj): Linear(4.19 M = 0.05% Params, 17.18 GMACs = 0.07% MACs, 199.32 us = 0.08% latency, 172.39 TFLOPS, in_features=4096, out_features=1024, bias=False)
+        (v_proj): Linear(4.19 M = 0.05% Params, 17.18 GMACs = 0.07% MACs, 195.5 us = 0.08% latency, 175.75 TFLOPS, in_features=4096, out_features=1024, bias=False)
+        (text_k_proj): Linear(4.19 M = 0.05% Params, 17.18 GMACs = 0.07% MACs, 157.59 us = 0.06% latency, 218.03 TFLOPS, in_features=4096, out_features=1024, bias=False)
+        (text_v_proj): Linear(4.19 M = 0.05% Params, 17.18 GMACs = 0.07% MACs, 153.78 us = 0.06% latency, 223.43 TFLOPS, in_features=4096, out_features=1024, bias=False)
+        (o_proj): Linear(16.78 M = 0.18% Params, 68.72 GMACs = 0.28% MACs, 354.53 us = 0.14% latency, 387.67 TFLOPS, in_features=4096, out_features=4096, bias=False)
+      )
+      (post_attention_layernorm): GemmaRMSNorm(4.1 K = 0% Params, 0 MACs = 0% MACs, 457.76 us = 0.19% latency, 0 FLOPS)
+      (mlp): GemmaMLP(
+        134.38 M = 1.46% Params, 550.43 GMACs = 2.21% MACs, 2.18 ms = 0.88% latency, 505.97 TFLOPS
+        (gate_proj): Linear(44.79 M = 0.49% Params, 183.48 GMACs = 0.74% MACs, 628.23 us = 0.26% latency, 584.1 TFLOPS, in_features=4096, out_features=10936, bias=False)
+        (up_proj): Linear(44.79 M = 0.49% Params, 183.48 GMACs = 0.74% MACs, 624.9 us = 0.25% latency, 587.22 TFLOPS, in_features=4096, out_features=10936, bias=False)
+        (down_proj): Linear(44.79 M = 0.49% Params, 183.48 GMACs = 0.74% MACs, 595.33 us = 0.24% latency, 616.38 TFLOPS, in_features=10936, out_features=4096, bias=False)
+        (act_fn): PytorchGELUTanh(0 = 0% Params, 0 MACs = 0% MACs, 116.11 us = 0.05% latency, 385.79 GFLOPS)
+      )
+    )
+    (1): DiTLayer(
+      285.41 M = 3.1% Params, 775.38 GMACs = 3.11% MACs, 7.2 ms = 2.93% latency, 215.36 TFLOPS
+      (input_layernorm): AdaLayerNormZero(
+        100.69 M = 1.09% Params, 1.61 GMACs = 0.01% MACs, 948.19 us = 0.39% latency, 3.4 TFLOPS
+        (silu): SiLU(0 = 0% Params, 0 MACs = 0% MACs, 36.72 us = 0.01% latency, 1.78 GFLOPS)
+        (linear): Linear(100.69 M = 1.09% Params, 1.61 GMACs = 0.01% MACs, 238.9 us = 0.1% latency, 13.48 TFLOPS, in_features=4096, out_features=24576, bias=True)
+        (norm): GemmaRMSNorm(4.1 K = 0% Params, 0 MACs = 0% MACs, 461.1 us = 0.19% latency, 0 FLOPS)
+      )
+      (self_attn): DiTSelfAttention(
+        50.33 M = 0.55% Params, 223.34 GMACs = 0.9% MACs, 3 ms = 1.22% latency, 148.78 TFLOPS
+        (q_norm): GemmaRMSNorm(128 = 0% Params, 0 MACs = 0% MACs, 462.77 us = 0.19% latency, 0 FLOPS)
+        (k_norm): GemmaRMSNorm(128 = 0% Params, 0 MACs = 0% MACs, 235.32 us = 0.1% latency, 0 FLOPS)
+        (q_proj): Linear(16.78 M = 0.18% Params, 68.72 GMACs = 0.28% MACs, 348.09 us = 0.14% latency, 394.84 TFLOPS, in_features=4096, out_features=4096, bias=False)
+        (k_proj): Linear(4.19 M = 0.05% Params, 17.18 GMACs = 0.07% MACs, 197.17 us = 0.08% latency, 174.26 TFLOPS, in_features=4096, out_features=1024, bias=False)
+        (v_proj): Linear(4.19 M = 0.05% Params, 17.18 GMACs = 0.07% MACs, 194.55 us = 0.08% latency, 176.61 TFLOPS, in_features=4096, out_features=1024, bias=False)
+        (text_k_proj): Linear(4.19 M = 0.05% Params, 17.18 GMACs = 0.07% MACs, 157.83 us = 0.06% latency, 217.7 TFLOPS, in_features=4096, out_features=1024, bias=False)
+        (text_v_proj): Linear(4.19 M = 0.05% Params, 17.18 GMACs = 0.07% MACs, 154.5 us = 0.06% latency, 222.4 TFLOPS, in_features=4096, out_features=1024, bias=False)
+        (o_proj): Linear(16.78 M = 0.18% Params, 68.72 GMACs = 0.28% MACs, 343.8 us = 0.14% latency, 399.76 TFLOPS, in_features=4096, out_features=4096, bias=False)
+      )
+      (post_attention_layernorm): GemmaRMSNorm(4.1 K = 0% Params, 0 MACs = 0% MACs, 457.05 us = 0.19% latency, 0 FLOPS)
+      (mlp): GemmaMLP(
+        134.38 M = 1.46% Params, 550.43 GMACs = 2.21% MACs, 2.14 ms = 0.87% latency, 514.26 TFLOPS
+        (gate_proj): Linear(44.79 M = 0.49% Params, 183.48 GMACs = 0.74% MACs, 627.04 us = 0.25% latency, 585.21 TFLOPS, in_features=4096, out_features=10936, bias=False)
+        (up_proj): Linear(44.79 M = 0.49% Params, 183.48 GMACs = 0.74% MACs, 621.8 us = 0.25% latency, 590.15 TFLOPS, in_features=4096, out_features=10936, bias=False)
+        (down_proj): Linear(44.79 M = 0.49% Params, 183.48 GMACs = 0.74% MACs, 587.7 us = 0.24% latency, 624.38 TFLOPS, in_features=10936, out_features=4096, bias=False)
+        (act_fn): PytorchGELUTanh(0 = 0% Params, 0 MACs = 0% MACs, 100.85 us = 0.04% latency, 444.16 GFLOPS)
+      )
+    )
+    (2): DiTLayer(
+      285.41 M = 3.1% Params, 775.38 GMACs = 3.11% MACs, 7.22 ms = 2.93% latency, 214.76 TFLOPS
+      (input_layernorm): AdaLayerNormZero(
+        100.69 M = 1.09% Params, 1.61 GMACs = 0.01% MACs, 947.48 us = 0.39% latency, 3.4 TFLOPS
+        (silu): SiLU(0 = 0% Params, 0 MACs = 0% MACs, 37.43 us = 0.02% latency, 1.75 GFLOPS)
+        (linear): Linear(100.69 M = 1.09% Params, 1.61 GMACs = 0.01% MACs, 238.66 us = 0.1% latency, 13.5 TFLOPS, in_features=4096, out_features=24576, bias=True)
+        (norm): GemmaRMSNorm(4.1 K = 0% Params, 0 MACs = 0% MACs, 462.06 us = 0.19% latency, 0 FLOPS)
+      )
+      (self_attn): DiTSelfAttention(
+        50.33 M = 0.55% Params, 223.34 GMACs = 0.9% MACs, 3.02 ms = 1.23% latency, 148.02 TFLOPS
+        (q_norm): GemmaRMSNorm(128 = 0% Params, 0 MACs = 0% MACs, 466.11 us = 0.19% latency, 0 FLOPS)
+        (k_norm): GemmaRMSNorm(128 = 0% Params, 0 MACs = 0% MACs, 234.6 us = 0.1% latency, 0 FLOPS)
+        (q_proj): Linear(16.78 M = 0.18% Params, 68.72 GMACs = 0.28% MACs, 349.04 us = 0.14% latency, 393.76 TFLOPS, in_features=4096, out_features=4096, bias=False)
+        (k_proj): Linear(4.19 M = 0.05% Params, 17.18 GMACs = 0.07% MACs, 198.13 us = 0.08% latency, 173.42 TFLOPS, in_features=4096, out_features=1024, bias=False)
+        (v_proj): Linear(4.19 M = 0.05% Params, 17.18 GMACs = 0.07% MACs, 194.79 us = 0.08% latency, 176.4 TFLOPS, in_features=4096, out_features=1024, bias=False)
+        (text_k_proj): Linear(4.19 M = 0.05% Params, 17.18 GMACs = 0.07% MACs, 159.74 us = 0.06% latency, 215.1 TFLOPS, in_features=4096, out_features=1024, bias=False)
+        (text_v_proj): Linear(4.19 M = 0.05% Params, 17.18 GMACs = 0.07% MACs, 154.5 us = 0.06% latency, 222.4 TFLOPS, in_features=4096, out_features=1024, bias=False)
+        (o_proj): Linear(16.78 M = 0.18% Params, 68.72 GMACs = 0.28% MACs, 339.98 us = 0.14% latency, 404.25 TFLOPS, in_features=4096, out_features=4096, bias=False)
+      )
+      (post_attention_layernorm): GemmaRMSNorm(4.1 K = 0% Params, 0 MACs = 0% MACs, 458.24 us = 0.19% latency, 0 FLOPS)
+      (mlp): GemmaMLP(
+        134.38 M = 1.46% Params, 550.43 GMACs = 2.21% MACs, 2.15 ms = 0.87% latency, 513.11 TFLOPS
+        (gate_proj): Linear(44.79 M = 0.49% Params, 183.48 GMACs = 0.74% MACs, 633.96 us = 0.26% latency, 578.83 TFLOPS, in_features=4096, out_features=10936, bias=False)
+        (up_proj): Linear(44.79 M = 0.49% Params, 183.48 GMACs = 0.74% MACs, 620.84 us = 0.25% latency, 591.05 TFLOPS, in_features=4096, out_features=10936, bias=False)
+        (down_proj): Linear(44.79 M = 0.49% Params, 183.48 GMACs = 0.74% MACs, 587.46 us = 0.24% latency, 624.64 TFLOPS, in_features=10936, out_features=4096, bias=False)
+        (act_fn): PytorchGELUTanh(0 = 0% Params, 0 MACs = 0% MACs, 100.61 us = 0.04% latency, 445.21 GFLOPS)
+      )
+    )
+    (3): DiTLayer(
+      285.41 M = 3.1% Params, 775.38 GMACs = 3.11% MACs, 7.25 ms = 2.94% latency, 214.03 TFLOPS
+      (input_layernorm): AdaLayerNormZero(
+        100.69 M = 1.09% Params, 1.61 GMACs = 0.01% MACs, 948.19 us = 0.39% latency, 3.4 TFLOPS
+        (silu): SiLU(0 = 0% Params, 0 MACs = 0% MACs, 38.15 us = 0.02% latency, 1.72 GFLOPS)
+        (linear): Linear(100.69 M = 1.09% Params, 1.61 GMACs = 0.01% MACs, 239.13 us = 0.1% latency, 13.47 TFLOPS, in_features=4096, out_features=24576, bias=True)
+        (norm): GemmaRMSNorm(4.1 K = 0% Params, 0 MACs = 0% MACs, 462.06 us = 0.19% latency, 0 FLOPS)
+      )
+      (self_attn): DiTSelfAttention(
+        50.33 M = 0.55% Params, 223.34 GMACs = 0.9% MACs, 3.04 ms = 1.24% latency, 146.93 TFLOPS
+        (q_norm): GemmaRMSNorm(128 = 0% Params, 0 MACs = 0% MACs, 464.68 us = 0.19% latency, 0 FLOPS)
+        (k_norm): GemmaRMSNorm(128 = 0% Params, 0 MACs = 0% MACs, 238.9 us = 0.1% latency, 0 FLOPS)
+        (q_proj): Linear(16.78 M = 0.18% Params, 68.72 GMACs = 0.28% MACs, 349.76 us = 0.14% latency, 392.95 TFLOPS, in_features=4096, out_features=4096, bias=False)
+        (k_proj): Linear(4.19 M = 0.05% Params, 17.18 GMACs = 0.07% MACs, 208.14 us = 0.08% latency, 165.08 TFLOPS, in_features=4096, out_features=1024, bias=False)
+        (v_proj): Linear(4.19 M = 0.05% Params, 17.18 GMACs = 0.07% MACs, 195.5 us = 0.08% latency, 175.75 TFLOPS, in_features=4096, out_features=1024, bias=False)
+        (text_k_proj): Linear(4.19 M = 0.05% Params, 17.18 GMACs = 0.07% MACs, 159.98 us = 0.07% latency, 214.78 TFLOPS, in_features=4096, out_features=1024, bias=False)
+        (text_v_proj): Linear(4.19 M = 0.05% Params, 17.18 GMACs = 0.07% MACs, 156.16 us = 0.06% latency, 220.02 TFLOPS, in_features=4096, out_features=1024, bias=False)
+        (o_proj): Linear(16.78 M = 0.18% Params, 68.72 GMACs = 0.28% MACs, 346.42 us = 0.14% latency, 396.74 TFLOPS, in_features=4096, out_features=4096, bias=False)
+      )
+      (post_attention_layernorm): GemmaRMSNorm(4.1 K = 0% Params, 0 MACs = 0% MACs, 458.48 us = 0.19% latency, 0 FLOPS)
+      (mlp): GemmaMLP(
+        134.38 M = 1.46% Params, 550.43 GMACs = 2.21% MACs, 2.15 ms = 0.87% latency, 512.94 TFLOPS
+        (gate_proj): Linear(44.79 M = 0.49% Params, 183.48 GMACs = 0.74% MACs, 627.52 us = 0.26% latency, 584.77 TFLOPS, in_features=4096, out_features=10936, bias=False)
+        (up_proj): Linear(44.79 M = 0.49% Params, 183.48 GMACs = 0.74% MACs, 623.7 us = 0.25% latency, 588.34 TFLOPS, in_features=4096, out_features=10936, bias=False)
+        (down_proj): Linear(44.79 M = 0.49% Params, 183.48 GMACs = 0.74% MACs, 590.09 us = 0.24% latency, 621.86 TFLOPS, in_features=10936, out_features=4096, bias=False)
+        (act_fn): PytorchGELUTanh(0 = 0% Params, 0 MACs = 0% MACs, 101.57 us = 0.04% latency, 441.03 GFLOPS)
+      )
+    )
+    (4): DiTLayer(
+      285.41 M = 3.1% Params, 775.38 GMACs = 3.11% MACs, 7.24 ms = 2.94% latency, 214.17 TFLOPS
+      (input_layernorm): AdaLayerNormZero(
+        100.69 M = 1.09% Params, 1.61 GMACs = 0.01% MACs, 959.87 us = 0.39% latency, 3.36 TFLOPS
+        (silu): SiLU(0 = 0% Params, 0 MACs = 0% MACs, 41.72 us = 0.02% latency, 1.57 GFLOPS)
+        (linear): Linear(100.69 M = 1.09% Params, 1.61 GMACs = 0.01% MACs, 243.66 us = 0.1% latency, 13.22 TFLOPS, in_features=4096, out_features=24576, bias=True)
+        (norm): GemmaRMSNorm(4.1 K = 0% Params, 0 MACs = 0% MACs, 463.72 us = 0.19% latency, 0 FLOPS)
+      )
+      (self_attn): DiTSelfAttention(
+        50.33 M = 0.55% Params, 223.34 GMACs = 0.9% MACs, 3.03 ms = 1.23% latency, 147.57 TFLOPS
+        (q_norm): GemmaRMSNorm(128 = 0% Params, 0 MACs = 0% MACs, 465.63 us = 0.19% latency, 0 FLOPS)
+        (k_norm): GemmaRMSNorm(128 = 0% Params, 0 MACs = 0% MACs, 237.7 us = 0.1% latency, 0 FLOPS)
+        (q_proj): Linear(16.78 M = 0.18% Params, 68.72 GMACs = 0.28% MACs, 349.76 us = 0.14% latency, 392.95 TFLOPS, in_features=4096, out_features=4096, bias=False)
+        (k_proj): Linear(4.19 M = 0.05% Params, 17.18 GMACs = 0.07% MACs, 198.6 us = 0.08% latency, 173.01 TFLOPS, in_features=4096, out_features=1024, bias=False)
+        (v_proj): Linear(4.19 M = 0.05% Params, 17.18 GMACs = 0.07% MACs, 196.22 us = 0.08% latency, 175.11 TFLOPS, in_features=4096, out_features=1024, bias=False)
+        (text_k_proj): Linear(4.19 M = 0.05% Params, 17.18 GMACs = 0.07% MACs, 168.56 us = 0.07% latency, 203.84 TFLOPS, in_features=4096, out_features=1024, bias=False)
+        (text_v_proj): Linear(4.19 M = 0.05% Params, 17.18 GMACs = 0.07% MACs, 156.16 us = 0.06% latency, 220.02 TFLOPS, in_features=4096, out_features=1024, bias=False)
+        (o_proj): Linear(16.78 M = 0.18% Params, 68.72 GMACs = 0.28% MACs, 334.26 us = 0.14% latency, 411.17 TFLOPS, in_features=4096, out_features=4096, bias=False)
+      )
+      (post_attention_layernorm): GemmaRMSNorm(4.1 K = 0% Params, 0 MACs = 0% MACs, 459.19 us = 0.19% latency, 0 FLOPS)
+      (mlp): GemmaMLP(
+        134.38 M = 1.46% Params, 550.43 GMACs = 2.21% MACs, 2.14 ms = 0.87% latency, 513.91 TFLOPS
+        (gate_proj): Linear(44.79 M = 0.49% Params, 183.48 GMACs = 0.74% MACs, 625.13 us = 0.25% latency, 587 TFLOPS, in_features=4096, out_features=10936, bias=False)
+        (up_proj): Linear(44.79 M = 0.49% Params, 183.48 GMACs = 0.74% MACs, 623.94 us = 0.25% latency, 588.12 TFLOPS, in_features=4096, out_features=10936, bias=False)
+        (down_proj): Linear(44.79 M = 0.49% Params, 183.48 GMACs = 0.74% MACs, 586.03 us = 0.24% latency, 626.16 TFLOPS, in_features=10936, out_features=4096, bias=False)
+        (act_fn): PytorchGELUTanh(0 = 0% Params, 0 MACs = 0% MACs, 101.33 us = 0.04% latency, 442.07 GFLOPS)
+      )
+    )
+    (5): DiTLayer(
+      285.41 M = 3.1% Params, 775.38 GMACs = 3.11% MACs, 7.24 ms = 2.94% latency, 214.34 TFLOPS
+      (input_layernorm): AdaLayerNormZero(
+        100.69 M = 1.09% Params, 1.61 GMACs = 0.01% MACs, 960.59 us = 0.39% latency, 3.35 TFLOPS
+        (silu): SiLU(0 = 0% Params, 0 MACs = 0% MACs, 39.1 us = 0.02% latency, 1.68 GFLOPS)
+        (linear): Linear(100.69 M = 1.09% Params, 1.61 GMACs = 0.01% MACs, 246.05 us = 0.1% latency, 13.09 TFLOPS, in_features=4096, out_features=24576, bias=True)
+        (norm): GemmaRMSNorm(4.1 K = 0% Params, 0 MACs = 0% MACs, 463.72 us = 0.19% latency, 0 FLOPS)
+      )
+      (self_attn): DiTSelfAttention(
+        50.33 M = 0.55% Params, 223.34 GMACs = 0.9% MACs, 3.01 ms = 1.22% latency, 148.22 TFLOPS
+        (q_norm): GemmaRMSNorm(128 = 0% Params, 0 MACs = 0% MACs, 463.01 us = 0.19% latency, 0 FLOPS)
+        (k_norm): GemmaRMSNorm(128 = 0% Params, 0 MACs = 0% MACs, 236.75 us = 0.1% latency, 0 FLOPS)
+        (q_proj): Linear(16.78 M = 0.18% Params, 68.72 GMACs = 0.28% MACs, 349.52 us = 0.14% latency, 393.22 TFLOPS, in_features=4096, out_features=4096, bias=False)
+        (k_proj): Linear(4.19 M = 0.05% Params, 17.18 GMACs = 0.07% MACs, 197.65 us = 0.08% latency, 173.84 TFLOPS, in_features=4096, out_features=1024, bias=False)
+        (v_proj): Linear(4.19 M = 0.05% Params, 17.18 GMACs = 0.07% MACs, 195.5 us = 0.08% latency, 175.75 TFLOPS, in_features=4096, out_features=1024, bias=False)
+        (text_k_proj): Linear(4.19 M = 0.05% Params, 17.18 GMACs = 0.07% MACs, 158.55 us = 0.06% latency, 216.71 TFLOPS, in_features=4096, out_features=1024, bias=False)
+        (text_v_proj): Linear(4.19 M = 0.05% Params, 17.18 GMACs = 0.07% MACs, 155.45 us = 0.06% latency, 221.04 TFLOPS, in_features=4096, out_features=1024, bias=False)
+        (o_proj): Linear(16.78 M = 0.18% Params, 68.72 GMACs = 0.28% MACs, 333.79 us = 0.14% latency, 411.76 TFLOPS, in_features=4096, out_features=4096, bias=False)
+      )
+      (post_attention_layernorm): GemmaRMSNorm(4.1 K = 0% Params, 0 MACs = 0% MACs, 458 us = 0.19% latency, 0 FLOPS)
+      (mlp): GemmaMLP(
+        134.38 M = 1.46% Params, 550.43 GMACs = 2.21% MACs, 2.15 ms = 0.87% latency, 512.26 TFLOPS
+        (gate_proj): Linear(44.79 M = 0.49% Params, 183.48 GMACs = 0.74% MACs, 626.8 us = 0.25% latency, 585.43 TFLOPS, in_features=4096, out_features=10936, bias=False)
+        (up_proj): Linear(44.79 M = 0.49% Params, 183.48 GMACs = 0.74% MACs, 623.7 us = 0.25% latency, 588.34 TFLOPS, in_features=4096, out_features=10936, bias=False)
+        (down_proj): Linear(44.79 M = 0.49% Params, 183.48 GMACs = 0.74% MACs, 586.99 us = 0.24% latency, 625.14 TFLOPS, in_features=10936, out_features=4096, bias=False)
+        (act_fn): PytorchGELUTanh(0 = 0% Params, 0 MACs = 0% MACs, 103 us = 0.04% latency, 434.91 GFLOPS)
+      )
+    )
+    (6): DiTLayer(
+      285.41 M = 3.1% Params, 775.38 GMACs = 3.11% MACs, 7.25 ms = 2.95% latency, 213.89 TFLOPS
+      (input_layernorm): AdaLayerNormZero(
+        100.69 M = 1.09% Params, 1.61 GMACs = 0.01% MACs, 994.21 us = 0.4% latency, 3.24 TFLOPS
+        (silu): SiLU(0 = 0% Params, 0 MACs = 0% MACs, 40.77 us = 0.02% latency, 1.61 GFLOPS)
+        (linear): Linear(100.69 M = 1.09% Params, 1.61 GMACs = 0.01% MACs, 237.46 us = 0.1% latency, 13.57 TFLOPS, in_features=4096, out_features=24576, bias=True)
+        (norm): GemmaRMSNorm(4.1 K = 0% Params, 0 MACs = 0% MACs, 463.01 us = 0.19% latency, 0 FLOPS)
+      )
+      (self_attn): DiTSelfAttention(
+        50.33 M = 0.55% Params, 223.34 GMACs = 0.9% MACs, 3.01 ms = 1.22% latency, 148.35 TFLOPS
+        (q_norm): GemmaRMSNorm(128 = 0% Params, 0 MACs = 0% MACs, 464.2 us = 0.19% latency, 0 FLOPS)
+        (k_norm): GemmaRMSNorm(128 = 0% Params, 0 MACs = 0% MACs, 235.08 us = 0.1% latency, 0 FLOPS)
+        (q_proj): Linear(16.78 M = 0.18% Params, 68.72 GMACs = 0.28% MACs, 347.85 us = 0.14% latency, 395.11 TFLOPS, in_features=4096, out_features=4096, bias=False)
+        (k_proj): Linear(4.19 M = 0.05% Params, 17.18 GMACs = 0.07% MACs, 196.7 us = 0.08% latency, 174.69 TFLOPS, in_features=4096, out_features=1024, bias=False)
+        (v_proj): Linear(4.19 M = 0.05% Params, 17.18 GMACs = 0.07% MACs, 193.6 us = 0.08% latency, 177.48 TFLOPS, in_features=4096, out_features=1024, bias=False)
+        (text_k_proj): Linear(4.19 M = 0.05% Params, 17.18 GMACs = 0.07% MACs, 156.88 us = 0.06% latency, 219.02 TFLOPS, in_features=4096, out_features=1024, bias=False)
+        (text_v_proj): Linear(4.19 M = 0.05% Params, 17.18 GMACs = 0.07% MACs, 154.26 us = 0.06% latency, 222.74 TFLOPS, in_features=4096, out_features=1024, bias=False)
+        (o_proj): Linear(16.78 M = 0.18% Params, 68.72 GMACs = 0.28% MACs, 340.22 us = 0.14% latency, 403.97 TFLOPS, in_features=4096, out_features=4096, bias=False)
+      )
+      (post_attention_layernorm): GemmaRMSNorm(4.1 K = 0% Params, 0 MACs = 0% MACs, 458.72 us = 0.19% latency, 0 FLOPS)
+      (mlp): GemmaMLP(
+        134.38 M = 1.46% Params, 550.43 GMACs = 2.21% MACs, 2.14 ms = 0.87% latency, 515.29 TFLOPS
+        (gate_proj): Linear(44.79 M = 0.49% Params, 183.48 GMACs = 0.74% MACs, 626.33 us = 0.25% latency, 585.88 TFLOPS, in_features=4096, out_features=10936, bias=False)
+        (up_proj): Linear(44.79 M = 0.49% Params, 183.48 GMACs = 0.74% MACs, 622.27 us = 0.25% latency, 589.7 TFLOPS, in_features=4096, out_features=10936, bias=False)
+        (down_proj): Linear(44.79 M = 0.49% Params, 183.48 GMACs = 0.74% MACs, 583.17 us = 0.24% latency, 629.23 TFLOPS, in_features=10936, out_features=4096, bias=False)
+        (act_fn): PytorchGELUTanh(0 = 0% Params, 0 MACs = 0% MACs, 100.37 us = 0.04% latency, 446.27 GFLOPS)
+      )
+    )
+    (7): DiTLayer(
+      285.41 M = 3.1% Params, 775.38 GMACs = 3.11% MACs, 7.2 ms = 2.93% latency, 215.45 TFLOPS
+      (input_layernorm): AdaLayerNormZero(
+        100.69 M = 1.09% Params, 1.61 GMACs = 0.01% MACs, 946.28 us = 0.38% latency, 3.4 TFLOPS
+        (silu): SiLU(0 = 0% Params, 0 MACs = 0% MACs, 36 us = 0.01% latency, 1.82 GFLOPS)
+        (linear): Linear(100.69 M = 1.09% Params, 1.61 GMACs = 0.01% MACs, 242.71 us = 0.1% latency, 13.27 TFLOPS, in_features=4096, out_features=24576, bias=True)
+        (norm): GemmaRMSNorm(4.1 K = 0% Params, 0 MACs = 0% MACs, 461.1 us = 0.19% latency, 0 FLOPS)
+      )
+      (self_attn): DiTSelfAttention(
+        50.33 M = 0.55% Params, 223.34 GMACs = 0.9% MACs, 3.01 ms = 1.22% latency, 148.48 TFLOPS
+        (q_norm): GemmaRMSNorm(128 = 0% Params, 0 MACs = 0% MACs, 464.68 us = 0.19% latency, 0 FLOPS)
+        (k_norm): GemmaRMSNorm(128 = 0% Params, 0 MACs = 0% MACs, 235.08 us = 0.1% latency, 0 FLOPS)
+        (q_proj): Linear(16.78 M = 0.18% Params, 68.72 GMACs = 0.28% MACs, 350 us = 0.14% latency, 392.68 TFLOPS, in_features=4096, out_features=4096, bias=False)
+        (k_proj): Linear(4.19 M = 0.05% Params, 17.18 GMACs = 0.07% MACs, 198.84 us = 0.08% latency, 172.8 TFLOPS, in_features=4096, out_features=1024, bias=False)
+        (v_proj): Linear(4.19 M = 0.05% Params, 17.18 GMACs = 0.07% MACs, 194.79 us = 0.08% latency, 176.4 TFLOPS, in_features=4096, out_features=1024, bias=False)
+        (text_k_proj): Linear(4.19 M = 0.05% Params, 17.18 GMACs = 0.07% MACs, 155.45 us = 0.06% latency, 221.04 TFLOPS, in_features=4096, out_features=1024, bias=False)
+        (text_v_proj): Linear(4.19 M = 0.05% Params, 17.18 GMACs = 0.07% MACs, 153.54 us = 0.06% latency, 223.78 TFLOPS, in_features=4096, out_features=1024, bias=False)
+        (o_proj): Linear(16.78 M = 0.18% Params, 68.72 GMACs = 0.28% MACs, 335.45 us = 0.14% latency, 409.71 TFLOPS, in_features=4096, out_features=4096, bias=False)
+      )
+      (post_attention_layernorm): GemmaRMSNorm(4.1 K = 0% Params, 0 MACs = 0% MACs, 456.57 us = 0.19% latency, 0 FLOPS)
+      (mlp): GemmaMLP(
+        134.38 M = 1.46% Params, 550.43 GMACs = 2.21% MACs, 2.14 ms = 0.87% latency, 515.12 TFLOPS
+        (gate_proj): Linear(44.79 M = 0.49% Params, 183.48 GMACs = 0.74% MACs, 627.28 us = 0.25% latency, 584.99 TFLOPS, in_features=4096, out_features=10936, bias=False)
+        (up_proj): Linear(44.79 M = 0.49% Params, 183.48 GMACs = 0.74% MACs, 622.27 us = 0.25% latency, 589.7 TFLOPS, in_features=4096, out_features=10936, bias=False)
+        (down_proj): Linear(44.79 M = 0.49% Params, 183.48 GMACs = 0.74% MACs, 583.17 us = 0.24% latency, 629.23 TFLOPS, in_features=10936, out_features=4096, bias=False)
+        (act_fn): PytorchGELUTanh(0 = 0% Params, 0 MACs = 0% MACs, 100.14 us = 0.04% latency, 447.33 GFLOPS)
+      )
+    )
+    (8): DiTLayer(
+      285.41 M = 3.1% Params, 775.38 GMACs = 3.11% MACs, 7.27 ms = 2.95% latency, 213.4 TFLOPS
+      (input_layernorm): AdaLayerNormZero(
+        100.69 M = 1.09% Params, 1.61 GMACs = 0.01% MACs, 960.35 us = 0.39% latency, 3.35 TFLOPS
+        (silu): SiLU(0 = 0% Params, 0 MACs = 0% MACs, 43.39 us = 0.02% latency, 1.51 GFLOPS)
+        (linear): Linear(100.69 M = 1.09% Params, 1.61 GMACs = 0.01% MACs, 238.18 us = 0.1% latency, 13.52 TFLOPS, in_features=4096, out_features=24576, bias=True)
+        (norm): GemmaRMSNorm(4.1 K = 0% Params, 0 MACs = 0% MACs, 463.72 us = 0.19% latency, 0 FLOPS)
+      )
+      (self_attn): DiTSelfAttention(
+        50.33 M = 0.55% Params, 223.34 GMACs = 0.9% MACs, 3.01 ms = 1.23% latency, 148.17 TFLOPS
+        (q_norm): GemmaRMSNorm(128 = 0% Params, 0 MACs = 0% MACs, 462.53 us = 0.19% latency, 0 FLOPS)
+        (k_norm): GemmaRMSNorm(128 = 0% Params, 0 MACs = 0% MACs, 236.27 us = 0.1% latency, 0 FLOPS)
+        (q_proj): Linear(16.78 M = 0.18% Params, 68.72 GMACs = 0.28% MACs, 348.33 us = 0.14% latency, 394.57 TFLOPS, in_features=4096, out_features=4096, bias=False)
+        (k_proj): Linear(4.19 M = 0.05% Params, 17.18 GMACs = 0.07% MACs, 198.36 us = 0.08% latency, 173.22 TFLOPS, in_features=4096, out_features=1024, bias=False)
+        (v_proj): Linear(4.19 M = 0.05% Params, 17.18 GMACs = 0.07% MACs, 194.55 us = 0.08% latency, 176.61 TFLOPS, in_features=4096, out_features=1024, bias=False)
+        (text_k_proj): Linear(4.19 M = 0.05% Params, 17.18 GMACs = 0.07% MACs, 156.4 us = 0.06% latency, 219.69 TFLOPS, in_features=4096, out_features=1024, bias=False)
+        (text_v_proj): Linear(4.19 M = 0.05% Params, 17.18 GMACs = 0.07% MACs, 154.97 us = 0.06% latency, 221.72 TFLOPS, in_features=4096, out_features=1024, bias=False)
+        (o_proj): Linear(16.78 M = 0.18% Params, 68.72 GMACs = 0.28% MACs, 341.89 us = 0.14% latency, 401.99 TFLOPS, in_features=4096, out_features=4096, bias=False)
+      )
+      (post_attention_layernorm): GemmaRMSNorm(4.1 K = 0% Params, 0 MACs = 0% MACs, 458 us = 0.19% latency, 0 FLOPS)
+      (mlp): GemmaMLP(
+        134.38 M = 1.46% Params, 550.43 GMACs = 2.21% MACs, 2.18 ms = 0.89% latency, 504.81 TFLOPS
+        (gate_proj): Linear(44.79 M = 0.49% Params, 183.48 GMACs = 0.74% MACs, 627.52 us = 0.26% latency, 584.77 TFLOPS, in_features=4096, out_features=10936, bias=False)
+        (up_proj): Linear(44.79 M = 0.49% Params, 183.48 GMACs = 0.74% MACs, 624.18 us = 0.25% latency, 587.89 TFLOPS, in_features=4096, out_features=10936, bias=False)
+        (down_proj): Linear(44.79 M = 0.49% Params, 183.48 GMACs = 0.74% MACs, 587.7 us = 0.24% latency, 624.38 TFLOPS, in_features=10936, out_features=4096, bias=False)
+        (act_fn): PytorchGELUTanh(0 = 0% Params, 0 MACs = 0% MACs, 103.24 us = 0.04% latency, 433.9 GFLOPS)
+      )
+    )
+    (9): DiTLayer(
+      285.41 M = 3.1% Params, 775.38 GMACs = 3.11% MACs, 7.27 ms = 2.96% latency, 213.18 TFLOPS
+      (input_layernorm): AdaLayerNormZero(
+        100.69 M = 1.09% Params, 1.61 GMACs = 0.01% MACs, 957.73 us = 0.39% latency, 3.36 TFLOPS
+        (silu): SiLU(0 = 0% Params, 0 MACs = 0% MACs, 38.86 us = 0.02% latency, 1.69 GFLOPS)
+        (linear): Linear(100.69 M = 1.09% Params, 1.61 GMACs = 0.01% MACs, 240.33 us = 0.1% latency, 13.4 TFLOPS, in_features=4096, out_features=24576, bias=True)
+        (norm): GemmaRMSNorm(4.1 K = 0% Params, 0 MACs = 0% MACs, 463.72 us = 0.19% latency, 0 FLOPS)
+      )
+      (self_attn): DiTSelfAttention(
+        50.33 M = 0.55% Params, 223.34 GMACs = 0.9% MACs, 3.03 ms = 1.23% latency, 147.24 TFLOPS
+        (q_norm): GemmaRMSNorm(128 = 0% Params, 0 MACs = 0% MACs, 464.92 us = 0.19% latency, 0 FLOPS)
+        (k_norm): GemmaRMSNorm(128 = 0% Params, 0 MACs = 0% MACs, 236.03 us = 0.1% latency, 0 FLOPS)
+        (q_proj): Linear(16.78 M = 0.18% Params, 68.72 GMACs = 0.28% MACs, 350 us = 0.14% latency, 392.68 TFLOPS, in_features=4096, out_features=4096, bias=False)
+        (k_proj): Linear(4.19 M = 0.05% Params, 17.18 GMACs = 0.07% MACs, 199.08 us = 0.08% latency, 172.59 TFLOPS, in_features=4096, out_features=1024, bias=False)
+        (v_proj): Linear(4.19 M = 0.05% Params, 17.18 GMACs = 0.07% MACs, 194.79 us = 0.08% latency, 176.4 TFLOPS, in_features=4096, out_features=1024, bias=False)
+        (text_k_proj): Linear(4.19 M = 0.05% Params, 17.18 GMACs = 0.07% MACs, 157.59 us = 0.06% latency, 218.03 TFLOPS, in_features=4096, out_features=1024, bias=False)
+        (text_v_proj): Linear(4.19 M = 0.05% Params, 17.18 GMACs = 0.07% MACs, 153.78 us = 0.06% latency, 223.43 TFLOPS, in_features=4096, out_features=1024, bias=False)
+        (o_proj): Linear(16.78 M = 0.18% Params, 68.72 GMACs = 0.28% MACs, 347.85 us = 0.14% latency, 395.11 TFLOPS, in_features=4096, out_features=4096, bias=False)
+      )
+      (post_attention_layernorm): GemmaRMSNorm(4.1 K = 0% Params, 0 MACs = 0% MACs, 459.19 us = 0.19% latency, 0 FLOPS)
+      (mlp): GemmaMLP(
+        134.38 M = 1.46% Params, 550.43 GMACs = 2.21% MACs, 2.16 ms = 0.88% latency, 510.73 TFLOPS
+        (gate_proj): Linear(44.79 M = 0.49% Params, 183.48 GMACs = 0.74% MACs, 635.15 us = 0.26% latency, 577.74 TFLOPS, in_features=4096, out_features=10936, bias=False)
+        (up_proj): Linear(44.79 M = 0.49% Params, 183.48 GMACs = 0.74% MACs, 623.46 us = 0.25% latency, 588.57 TFLOPS, in_features=4096, out_features=10936, bias=False)
+        (down_proj): Linear(44.79 M = 0.49% Params, 183.48 GMACs = 0.74% MACs, 587.46 us = 0.24% latency, 624.64 TFLOPS, in_features=10936, out_features=4096, bias=False)
+        (act_fn): PytorchGELUTanh(0 = 0% Params, 0 MACs = 0% MACs, 102.76 us = 0.04% latency, 435.91 GFLOPS)
+      )
+    )
+    (10): DiTLayer(
+      285.41 M = 3.1% Params, 775.38 GMACs = 3.11% MACs, 7.27 ms = 2.96% latency, 213.26 TFLOPS
+      (input_layernorm): AdaLayerNormZero(
+        100.69 M = 1.09% Params, 1.61 GMACs = 0.01% MACs, 949.38 us = 0.39% latency, 3.39 TFLOPS
+        (silu): SiLU(0 = 0% Params, 0 MACs = 0% MACs, 36.72 us = 0.01% latency, 1.78 GFLOPS)
+        (linear): Linear(100.69 M = 1.09% Params, 1.61 GMACs = 0.01% MACs, 241.28 us = 0.1% latency, 13.35 TFLOPS, in_features=4096, out_features=24576, bias=True)
+        (norm): GemmaRMSNorm(4.1 K = 0% Params, 0 MACs = 0% MACs, 461.58 us = 0.19% latency, 0 FLOPS)
+      )
+      (self_attn): DiTSelfAttention(
+        50.33 M = 0.55% Params, 223.34 GMACs = 0.9% MACs, 3.03 ms = 1.23% latency, 147.18 TFLOPS
+        (q_norm): GemmaRMSNorm(128 = 0% Params, 0 MACs = 0% MACs, 466.11 us = 0.19% latency, 0 FLOPS)
+        (k_norm): GemmaRMSNorm(128 = 0% Params, 0 MACs = 0% MACs, 237.46 us = 0.1% latency, 0 FLOPS)
+        (q_proj): Linear(16.78 M = 0.18% Params, 68.72 GMACs = 0.28% MACs, 349.28 us = 0.14% latency, 393.49 TFLOPS, in_features=4096, out_features=4096, bias=False)
+        (k_proj): Linear(4.19 M = 0.05% Params, 17.18 GMACs = 0.07% MACs, 198.13 us = 0.08% latency, 173.42 TFLOPS, in_features=4096, out_features=1024, bias=False)
+        (v_proj): Linear(4.19 M = 0.05% Params, 17.18 GMACs = 0.07% MACs, 194.79 us = 0.08% latency, 176.4 TFLOPS, in_features=4096, out_features=1024, bias=False)
+        (text_k_proj): Linear(4.19 M = 0.05% Params, 17.18 GMACs = 0.07% MACs, 158.31 us = 0.06% latency, 217.04 TFLOPS, in_features=4096, out_features=1024, bias=False)
+        (text_v_proj): Linear(4.19 M = 0.05% Params, 17.18 GMACs = 0.07% MACs, 154.97 us = 0.06% latency, 221.72 TFLOPS, in_features=4096, out_features=1024, bias=False)
+        (o_proj): Linear(16.78 M = 0.18% Params, 68.72 GMACs = 0.28% MACs, 347.14 us = 0.14% latency, 395.92 TFLOPS, in_features=4096, out_features=4096, bias=False)
+      )
+      (post_attention_layernorm): GemmaRMSNorm(4.1 K = 0% Params, 0 MACs = 0% MACs, 462.06 us = 0.19% latency, 0 FLOPS)
+      (mlp): GemmaMLP(
+        134.38 M = 1.46% Params, 550.43 GMACs = 2.21% MACs, 2.17 ms = 0.88% latency, 508.48 TFLOPS
+        (gate_proj): Linear(44.79 M = 0.49% Params, 183.48 GMACs = 0.74% MACs, 629.19 us = 0.26% latency, 583.22 TFLOPS, in_features=4096, out_features=10936, bias=False)
+        (up_proj): Linear(44.79 M = 0.49% Params, 183.48 GMACs = 0.74% MACs, 623.94 us = 0.25% latency, 588.12 TFLOPS, in_features=4096, out_features=10936, bias=False)
+        (down_proj): Linear(44.79 M = 0.49% Params, 183.48 GMACs = 0.74% MACs, 594.38 us = 0.24% latency, 617.37 TFLOPS, in_features=10936, out_features=4096, bias=False)
+        (act_fn): PytorchGELUTanh(0 = 0% Params, 0 MACs = 0% MACs, 102.52 us = 0.04% latency, 436.93 GFLOPS)
+      )
+    )
+    (11): DiTLayer(
+      285.41 M = 3.1% Params, 775.38 GMACs = 3.11% MACs, 7.31 ms = 2.97% latency, 212.09 TFLOPS
+      (input_layernorm): AdaLayerNormZero(
+        100.69 M = 1.09% Params, 1.61 GMACs = 0.01% MACs, 954.63 us = 0.39% latency, 3.37 TFLOPS
+        (silu): SiLU(0 = 0% Params, 0 MACs = 0% MACs, 38.15 us = 0.02% latency, 1.72 GFLOPS)
+        (linear): Linear(100.69 M = 1.09% Params, 1.61 GMACs = 0.01% MACs, 239.85 us = 0.1% latency, 13.43 TFLOPS, in_features=4096, out_features=24576, bias=True)
+        (norm): GemmaRMSNorm(4.1 K = 0% Params, 0 MACs = 0% MACs, 462.06 us = 0.19% latency, 0 FLOPS)
+      )
+      (self_attn): DiTSelfAttention(
+        50.33 M = 0.55% Params, 223.34 GMACs = 0.9% MACs, 3.09 ms = 1.26% latency, 144.35 TFLOPS
+        (q_norm): GemmaRMSNorm(128 = 0% Params, 0 MACs = 0% MACs, 467.54 us = 0.19% latency, 0 FLOPS)
+        (k_norm): GemmaRMSNorm(128 = 0% Params, 0 MACs = 0% MACs, 236.75 us = 0.1% latency, 0 FLOPS)
+        (q_proj): Linear(16.78 M = 0.18% Params, 68.72 GMACs = 0.28% MACs, 354.05 us = 0.14% latency, 388.19 TFLOPS, in_features=4096, out_features=4096, bias=False)
+        (k_proj): Linear(4.19 M = 0.05% Params, 17.18 GMACs = 0.07% MACs, 202.89 us = 0.08% latency, 169.35 TFLOPS, in_features=4096, out_features=1024, bias=False)
+        (v_proj): Linear(4.19 M = 0.05% Params, 17.18 GMACs = 0.07% MACs, 197.89 us = 0.08% latency, 173.63 TFLOPS, in_features=4096, out_features=1024, bias=False)
+        (text_k_proj): Linear(4.19 M = 0.05% Params, 17.18 GMACs = 0.07% MACs, 160.46 us = 0.07% latency, 214.14 TFLOPS, in_features=4096, out_features=1024, bias=False)
+        (text_v_proj): Linear(4.19 M = 0.05% Params, 17.18 GMACs = 0.07% MACs, 160.46 us = 0.07% latency, 214.14 TFLOPS, in_features=4096, out_features=1024, bias=False)
+        (o_proj): Linear(16.78 M = 0.18% Params, 68.72 GMACs = 0.28% MACs, 340.7 us = 0.14% latency, 403.4 TFLOPS, in_features=4096, out_features=4096, bias=False)
+      )
+      (post_attention_layernorm): GemmaRMSNorm(4.1 K = 0% Params, 0 MACs = 0% MACs, 458.24 us = 0.19% latency, 0 FLOPS)
+      (mlp): GemmaMLP(
+        134.38 M = 1.46% Params, 550.43 GMACs = 2.21% MACs, 2.15 ms = 0.87% latency, 512.88 TFLOPS
+        (gate_proj): Linear(44.79 M = 0.49% Params, 183.48 GMACs = 0.74% MACs, 626.09 us = 0.25% latency, 586.1 TFLOPS, in_features=4096, out_features=10936, bias=False)
+        (up_proj): Linear(44.79 M = 0.49% Params, 183.48 GMACs = 0.74% MACs, 623.23 us = 0.25% latency, 588.79 TFLOPS, in_features=4096, out_features=10936, bias=False)
+        (down_proj): Linear(44.79 M = 0.49% Params, 183.48 GMACs = 0.74% MACs, 588.66 us = 0.24% latency, 623.37 TFLOPS, in_features=10936, out_features=4096, bias=False)
+        (act_fn): PytorchGELUTanh(0 = 0% Params, 0 MACs = 0% MACs, 101.33 us = 0.04% latency, 442.07 GFLOPS)
+      )
+    )
+    (12): DiTLayer(
+      285.41 M = 3.1% Params, 775.38 GMACs = 3.11% MACs, 7.21 ms = 2.93% latency, 215.14 TFLOPS
+      (input_layernorm): AdaLayerNormZero(
+        100.69 M = 1.09% Params, 1.61 GMACs = 0.01% MACs, 944.38 us = 0.38% latency, 3.41 TFLOPS
+        (silu): SiLU(0 = 0% Params, 0 MACs = 0% MACs, 36.95 us = 0.02% latency, 1.77 GFLOPS)
+        (linear): Linear(100.69 M = 1.09% Params, 1.61 GMACs = 0.01% MACs, 235.8 us = 0.1% latency, 13.66 TFLOPS, in_features=4096, out_features=24576, bias=True)
+        (norm): GemmaRMSNorm(4.1 K = 0% Params, 0 MACs = 0% MACs, 460.86 us = 0.19% latency, 0 FLOPS)
+      )
+      (self_attn): DiTSelfAttention(
+        50.33 M = 0.55% Params, 223.34 GMACs = 0.9% MACs, 3.01 ms = 1.22% latency, 148.27 TFLOPS
+        (q_norm): GemmaRMSNorm(128 = 0% Params, 0 MACs = 0% MACs, 464.44 us = 0.19% latency, 0 FLOPS)
+        (k_norm): GemmaRMSNorm(128 = 0% Params, 0 MACs = 0% MACs, 238.42 us = 0.1% latency, 0 FLOPS)
+        (q_proj): Linear(16.78 M = 0.18% Params, 68.72 GMACs = 0.28% MACs, 347.38 us = 0.14% latency, 395.65 TFLOPS, in_features=4096, out_features=4096, bias=False)
+        (k_proj): Linear(4.19 M = 0.05% Params, 17.18 GMACs = 0.07% MACs, 198.13 us = 0.08% latency, 173.42 TFLOPS, in_features=4096, out_features=1024, bias=False)
+        (v_proj): Linear(4.19 M = 0.05% Params, 17.18 GMACs = 0.07% MACs, 195.98 us = 0.08% latency, 175.32 TFLOPS, in_features=4096, out_features=1024, bias=False)
+        (text_k_proj): Linear(4.19 M = 0.05% Params, 17.18 GMACs = 0.07% MACs, 158.07 us = 0.06% latency, 217.37 TFLOPS, in_features=4096, out_features=1024, bias=False)
+        (text_v_proj): Linear(4.19 M = 0.05% Params, 17.18 GMACs = 0.07% MACs, 155.69 us = 0.06% latency, 220.7 TFLOPS, in_features=4096, out_features=1024, bias=False)
+        (o_proj): Linear(16.78 M = 0.18% Params, 68.72 GMACs = 0.28% MACs, 337.84 us = 0.14% latency, 406.82 TFLOPS, in_features=4096, out_features=4096, bias=False)
+      )
+      (post_attention_layernorm): GemmaRMSNorm(4.1 K = 0% Params, 0 MACs = 0% MACs, 458.48 us = 0.19% latency, 0 FLOPS)
+      (mlp): GemmaMLP(
+        134.38 M = 1.46% Params, 550.43 GMACs = 2.21% MACs, 2.14 ms = 0.87% latency, 514.26 TFLOPS
+        (gate_proj): Linear(44.79 M = 0.49% Params, 183.48 GMACs = 0.74% MACs, 624.9 us = 0.25% latency, 587.22 TFLOPS, in_features=4096, out_features=10936, bias=False)
+        (up_proj): Linear(44.79 M = 0.49% Params, 183.48 GMACs = 0.74% MACs, 622.75 us = 0.25% latency, 589.24 TFLOPS, in_features=4096, out_features=10936, bias=False)
+        (down_proj): Linear(44.79 M = 0.49% Params, 183.48 GMACs = 0.74% MACs, 587.22 us = 0.24% latency, 624.89 TFLOPS, in_features=10936, out_features=4096, bias=False)
+        (act_fn): PytorchGELUTanh(0 = 0% Params, 0 MACs = 0% MACs, 101.8 us = 0.04% latency, 440 GFLOPS)
+      )
+    )
+    (13): DiTLayer(
+      285.41 M = 3.1% Params, 775.38 GMACs = 3.11% MACs, 7.24 ms = 2.94% latency, 214.3 TFLOPS
+      (input_layernorm): AdaLayerNormZero(
+        100.69 M = 1.09% Params, 1.61 GMACs = 0.01% MACs, 960.11 us = 0.39% latency, 3.36 TFLOPS
+        (silu): SiLU(0 = 0% Params, 0 MACs = 0% MACs, 39.34 us = 0.02% latency, 1.67 GFLOPS)
+        (linear): Linear(100.69 M = 1.09% Params, 1.61 GMACs = 0.01% MACs, 246.05 us = 0.1% latency, 13.09 TFLOPS, in_features=4096, out_features=24576, bias=True)
+        (norm): GemmaRMSNorm(4.1 K = 0% Params, 0 MACs = 0% MACs, 464.2 us = 0.19% latency, 0 FLOPS)
+      )
+      (self_attn): DiTSelfAttention(
+        50.33 M = 0.55% Params, 223.34 GMACs = 0.9% MACs, 3.03 ms = 1.23% latency, 147.59 TFLOPS
+        (q_norm): GemmaRMSNorm(128 = 0% Params, 0 MACs = 0% MACs, 464.44 us = 0.19% latency, 0 FLOPS)
+        (k_norm): GemmaRMSNorm(128 = 0% Params, 0 MACs = 0% MACs, 236.27 us = 0.1% latency, 0 FLOPS)
+        (q_proj): Linear(16.78 M = 0.18% Params, 68.72 GMACs = 0.28% MACs, 348.09 us = 0.14% latency, 394.84 TFLOPS, in_features=4096, out_features=4096, bias=False)
+        (k_proj): Linear(4.19 M = 0.05% Params, 17.18 GMACs = 0.07% MACs, 199.32 us = 0.08% latency, 172.39 TFLOPS, in_features=4096, out_features=1024, bias=False)
+        (v_proj): Linear(4.19 M = 0.05% Params, 17.18 GMACs = 0.07% MACs, 195.26 us = 0.08% latency, 175.96 TFLOPS, in_features=4096, out_features=1024, bias=False)
+        (text_k_proj): Linear(4.19 M = 0.05% Params, 17.18 GMACs = 0.07% MACs, 168.56 us = 0.07% latency, 203.84 TFLOPS, in_features=4096, out_features=1024, bias=False)
+        (text_v_proj): Linear(4.19 M = 0.05% Params, 17.18 GMACs = 0.07% MACs, 155.69 us = 0.06% latency, 220.7 TFLOPS, in_features=4096, out_features=1024, bias=False)
+        (o_proj): Linear(16.78 M = 0.18% Params, 68.72 GMACs = 0.28% MACs, 338.08 us = 0.14% latency, 406.53 TFLOPS, in_features=4096, out_features=4096, bias=False)
+      )
+      (post_attention_layernorm): GemmaRMSNorm(4.1 K = 0% Params, 0 MACs = 0% MACs, 459.19 us = 0.19% latency, 0 FLOPS)
+      (mlp): GemmaMLP(
+        134.38 M = 1.46% Params, 550.43 GMACs = 2.21% MACs, 2.14 ms = 0.87% latency, 514.6 TFLOPS
+        (gate_proj): Linear(44.79 M = 0.49% Params, 183.48 GMACs = 0.74% MACs, 626.33 us = 0.25% latency, 585.88 TFLOPS, in_features=4096, out_features=10936, bias=False)
+        (up_proj): Linear(44.79 M = 0.49% Params, 183.48 GMACs = 0.74% MACs, 621.8 us = 0.25% latency, 590.15 TFLOPS, in_features=4096, out_features=10936, bias=False)
+        (down_proj): Linear(44.79 M = 0.49% Params, 183.48 GMACs = 0.74% MACs, 587.94 us = 0.24% latency, 624.13 TFLOPS, in_features=10936, out_features=4096, bias=False)
+        (act_fn): PytorchGELUTanh(0 = 0% Params, 0 MACs = 0% MACs, 101.57 us = 0.04% latency, 441.03 GFLOPS)
+      )
+    )
+    (14): DiTLayer(
+      285.41 M = 3.1% Params, 775.38 GMACs = 3.11% MACs, 7.21 ms = 2.93% latency, 214.95 TFLOPS
+      (input_layernorm): AdaLayerNormZero(
+        100.69 M = 1.09% Params, 1.61 GMACs = 0.01% MACs, 953.2 us = 0.39% latency, 3.38 TFLOPS
+        (silu): SiLU(0 = 0% Params, 0 MACs = 0% MACs, 39.58 us = 0.02% latency, 1.66 GFLOPS)
+        (linear): Linear(100.69 M = 1.09% Params, 1.61 GMACs = 0.01% MACs, 242.23 us = 0.1% latency, 13.3 TFLOPS, in_features=4096, out_features=24576, bias=True)
+        (norm): GemmaRMSNorm(4.1 K = 0% Params, 0 MACs = 0% MACs, 461.34 us = 0.19% latency, 0 FLOPS)
+      )
+      (self_attn): DiTSelfAttention(
+        50.33 M = 0.55% Params, 223.34 GMACs = 0.9% MACs, 3.01 ms = 1.22% latency, 148.21 TFLOPS
+        (q_norm): GemmaRMSNorm(128 = 0% Params, 0 MACs = 0% MACs, 463.96 us = 0.19% latency, 0 FLOPS)
+        (k_norm): GemmaRMSNorm(128 = 0% Params, 0 MACs = 0% MACs, 236.03 us = 0.1% latency, 0 FLOPS)
+        (q_proj): Linear(16.78 M = 0.18% Params, 68.72 GMACs = 0.28% MACs, 348.81 us = 0.14% latency, 394.03 TFLOPS, in_features=4096, out_features=4096, bias=False)
+        (k_proj): Linear(4.19 M = 0.05% Params, 17.18 GMACs = 0.07% MACs, 197.65 us = 0.08% latency, 173.84 TFLOPS, in_features=4096, out_features=1024, bias=False)
+        (v_proj): Linear(4.19 M = 0.05% Params, 17.18 GMACs = 0.07% MACs, 193.83 us = 0.08% latency, 177.26 TFLOPS, in_features=4096, out_features=1024, bias=False)
+        (text_k_proj): Linear(4.19 M = 0.05% Params, 17.18 GMACs = 0.07% MACs, 156.4 us = 0.06% latency, 219.69 TFLOPS, in_features=4096, out_features=1024, bias=False)
+        (text_v_proj): Linear(4.19 M = 0.05% Params, 17.18 GMACs = 0.07% MACs, 154.26 us = 0.06% latency, 222.74 TFLOPS, in_features=4096, out_features=1024, bias=False)
+        (o_proj): Linear(16.78 M = 0.18% Params, 68.72 GMACs = 0.28% MACs, 339.98 us = 0.14% latency, 404.25 TFLOPS, in_features=4096, out_features=4096, bias=False)
+      )
+      (post_attention_layernorm): GemmaRMSNorm(4.1 K = 0% Params, 0 MACs = 0% MACs, 458.48 us = 0.19% latency, 0 FLOPS)
+      (mlp): GemmaMLP(
+        134.38 M = 1.46% Params, 550.43 GMACs = 2.21% MACs, 2.14 ms = 0.87% latency, 515.29 TFLOPS
+        (gate_proj): Linear(44.79 M = 0.49% Params, 183.48 GMACs = 0.74% MACs, 624.66 us = 0.25% latency, 587.44 TFLOPS, in_features=4096, out_features=10936, bias=False)
+        (up_proj): Linear(44.79 M = 0.49% Params, 183.48 GMACs = 0.74% MACs, 621.56 us = 0.25% latency, 590.37 TFLOPS, in_features=4096, out_features=10936, bias=False)
+        (down_proj): Linear(44.79 M = 0.49% Params, 183.48 GMACs = 0.74% MACs, 587.7 us = 0.24% latency, 624.38 TFLOPS, in_features=10936, out_features=4096, bias=False)
+        (act_fn): PytorchGELUTanh(0 = 0% Params, 0 MACs = 0% MACs, 100.61 us = 0.04% latency, 445.21 GFLOPS)
+      )
+    )
+    (15): DiTLayer(
+      285.41 M = 3.1% Params, 775.38 GMACs = 3.11% MACs, 7.23 ms = 2.94% latency, 214.39 TFLOPS
+      (input_layernorm): AdaLayerNormZero(
+        100.69 M = 1.09% Params, 1.61 GMACs = 0.01% MACs, 954.15 us = 0.39% latency, 3.38 TFLOPS
+        (silu): SiLU(0 = 0% Params, 0 MACs = 0% MACs, 40.05 us = 0.02% latency, 1.64 GFLOPS)
+        (linear): Linear(100.69 M = 1.09% Params, 1.61 GMACs = 0.01% MACs, 240.33 us = 0.1% latency, 13.4 TFLOPS, in_features=4096, out_features=24576, bias=True)
+        (norm): GemmaRMSNorm(4.1 K = 0% Params, 0 MACs = 0% MACs, 464.92 us = 0.19% latency, 0 FLOPS)
+      )
+      (self_attn): DiTSelfAttention(
+        50.33 M = 0.55% Params, 223.34 GMACs = 0.9% MACs, 3.02 ms = 1.23% latency, 147.86 TFLOPS
+        (q_norm): GemmaRMSNorm(128 = 0% Params, 0 MACs = 0% MACs, 463.96 us = 0.19% latency, 0 FLOPS)
+        (k_norm): GemmaRMSNorm(128 = 0% Params, 0 MACs = 0% MACs, 236.03 us = 0.1% latency, 0 FLOPS)
+        (q_proj): Linear(16.78 M = 0.18% Params, 68.72 GMACs = 0.28% MACs, 350.48 us = 0.14% latency, 392.15 TFLOPS, in_features=4096, out_features=4096, bias=False)
+        (k_proj): Linear(4.19 M = 0.05% Params, 17.18 GMACs = 0.07% MACs, 198.84 us = 0.08% latency, 172.8 TFLOPS, in_features=4096, out_features=1024, bias=False)
+        (v_proj): Linear(4.19 M = 0.05% Params, 17.18 GMACs = 0.07% MACs, 195.5 us = 0.08% latency, 175.75 TFLOPS, in_features=4096, out_features=1024, bias=False)
+        (text_k_proj): Linear(4.19 M = 0.05% Params, 17.18 GMACs = 0.07% MACs, 158.79 us = 0.06% latency, 216.39 TFLOPS, in_features=4096, out_features=1024, bias=False)
+        (text_v_proj): Linear(4.19 M = 0.05% Params, 17.18 GMACs = 0.07% MACs, 154.26 us = 0.06% latency, 222.74 TFLOPS, in_features=4096, out_features=1024, bias=False)
+        (o_proj): Linear(16.78 M = 0.18% Params, 68.72 GMACs = 0.28% MACs, 340.7 us = 0.14% latency, 403.4 TFLOPS, in_features=4096, out_features=4096, bias=False)
+      )
+      (post_attention_layernorm): GemmaRMSNorm(4.1 K = 0% Params, 0 MACs = 0% MACs, 458.24 us = 0.19% latency, 0 FLOPS)
+      (mlp): GemmaMLP(
+        134.38 M = 1.46% Params, 550.43 GMACs = 2.21% MACs, 2.14 ms = 0.87% latency, 513.74 TFLOPS
+        (gate_proj): Linear(44.79 M = 0.49% Params, 183.48 GMACs = 0.74% MACs, 628.47 us = 0.26% latency, 583.88 TFLOPS, in_features=4096, out_features=10936, bias=False)
+        (up_proj): Linear(44.79 M = 0.49% Params, 183.48 GMACs = 0.74% MACs, 624.9 us = 0.25% latency, 587.22 TFLOPS, in_features=4096, out_features=10936, bias=False)
+        (down_proj): Linear(44.79 M = 0.49% Params, 183.48 GMACs = 0.74% MACs, 586.03 us = 0.24% latency, 626.16 TFLOPS, in_features=10936, out_features=4096, bias=False)
+        (act_fn): PytorchGELUTanh(0 = 0% Params, 0 MACs = 0% MACs, 101.09 us = 0.04% latency, 443.11 GFLOPS)
+      )
+    )
+    (16): DiTLayer(
+      285.41 M = 3.1% Params, 775.38 GMACs = 3.11% MACs, 7.28 ms = 2.96% latency, 212.96 TFLOPS
+      (input_layernorm): AdaLayerNormZero(
+        100.69 M = 1.09% Params, 1.61 GMACs = 0.01% MACs, 967.03 us = 0.39% latency, 3.33 TFLOPS
+        (silu): SiLU(0 = 0% Params, 0 MACs = 0% MACs, 48.16 us = 0.02% latency, 1.36 GFLOPS)
+        (linear): Linear(100.69 M = 1.09% Params, 1.61 GMACs = 0.01% MACs, 244.86 us = 0.1% latency, 13.16 TFLOPS, in_features=4096, out_features=24576, bias=True)
+        (norm): GemmaRMSNorm(4.1 K = 0% Params, 0 MACs = 0% MACs, 463.96 us = 0.19% latency, 0 FLOPS)
+      )
+      (self_attn): DiTSelfAttention(
+        50.33 M = 0.55% Params, 223.34 GMACs = 0.9% MACs, 3.04 ms = 1.23% latency, 147.06 TFLOPS
+        (q_norm): GemmaRMSNorm(128 = 0% Params, 0 MACs = 0% MACs, 465.39 us = 0.19% latency, 0 FLOPS)
+        (k_norm): GemmaRMSNorm(128 = 0% Params, 0 MACs = 0% MACs, 236.51 us = 0.1% latency, 0 FLOPS)
+        (q_proj): Linear(16.78 M = 0.18% Params, 68.72 GMACs = 0.28% MACs, 351.67 us = 0.14% latency, 390.82 TFLOPS, in_features=4096, out_features=4096, bias=False)
+        (k_proj): Linear(4.19 M = 0.05% Params, 17.18 GMACs = 0.07% MACs, 197.65 us = 0.08% latency, 173.84 TFLOPS, in_features=4096, out_features=1024, bias=False)
+        (v_proj): Linear(4.19 M = 0.05% Params, 17.18 GMACs = 0.07% MACs, 195.5 us = 0.08% latency, 175.75 TFLOPS, in_features=4096, out_features=1024, bias=False)
+        (text_k_proj): Linear(4.19 M = 0.05% Params, 17.18 GMACs = 0.07% MACs, 158.55 us = 0.06% latency, 216.71 TFLOPS, in_features=4096, out_features=1024, bias=False)
+        (text_v_proj): Linear(4.19 M = 0.05% Params, 17.18 GMACs = 0.07% MACs, 154.26 us = 0.06% latency, 222.74 TFLOPS, in_features=4096, out_features=1024, bias=False)
+        (o_proj): Linear(16.78 M = 0.18% Params, 68.72 GMACs = 0.28% MACs, 344.28 us = 0.14% latency, 399.21 TFLOPS, in_features=4096, out_features=4096, bias=False)
+      )
+      (post_attention_layernorm): GemmaRMSNorm(4.1 K = 0% Params, 0 MACs = 0% MACs, 457.53 us = 0.19% latency, 0 FLOPS)
+      (mlp): GemmaMLP(
+        134.38 M = 1.46% Params, 550.43 GMACs = 2.21% MACs, 2.16 ms = 0.88% latency, 509.04 TFLOPS
+        (gate_proj): Linear(44.79 M = 0.49% Params, 183.48 GMACs = 0.74% MACs, 631.81 us = 0.26% latency, 580.79 TFLOPS, in_features=4096, out_features=10936, bias=False)
+        (up_proj): Linear(44.79 M = 0.49% Params, 183.48 GMACs = 0.74% MACs, 625.85 us = 0.25% latency, 586.33 TFLOPS, in_features=4096, out_features=10936, bias=False)
+        (down_proj): Linear(44.79 M = 0.49% Params, 183.48 GMACs = 0.74% MACs, 596.76 us = 0.24% latency, 614.9 TFLOPS, in_features=10936, out_features=4096, bias=False)
+        (act_fn): PytorchGELUTanh(0 = 0% Params, 0 MACs = 0% MACs, 101.33 us = 0.04% latency, 442.07 GFLOPS)
+      )
+    )
+    (17): DiTLayer(
+      285.41 M = 3.1% Params, 775.38 GMACs = 3.11% MACs, 7.22 ms = 2.94% latency, 214.68 TFLOPS
+      (input_layernorm): AdaLayerNormZero(
+        100.69 M = 1.09% Params, 1.61 GMACs = 0.01% MACs, 951.29 us = 0.39% latency, 3.39 TFLOPS
+        (silu): SiLU(0 = 0% Params, 0 MACs = 0% MACs, 38.62 us = 0.02% latency, 1.7 GFLOPS)
+        (linear): Linear(100.69 M = 1.09% Params, 1.61 GMACs = 0.01% MACs, 240.8 us = 0.1% latency, 13.38 TFLOPS, in_features=4096, out_features=24576, bias=True)
+        (norm): GemmaRMSNorm(4.1 K = 0% Params, 0 MACs = 0% MACs, 461.58 us = 0.19% latency, 0 FLOPS)
+      )
+      (self_attn): DiTSelfAttention(
+        50.33 M = 0.55% Params, 223.34 GMACs = 0.9% MACs, 3.01 ms = 1.22% latency, 148.22 TFLOPS
+        (q_norm): GemmaRMSNorm(128 = 0% Params, 0 MACs = 0% MACs, 465.39 us = 0.19% latency, 0 FLOPS)
+        (k_norm): GemmaRMSNorm(128 = 0% Params, 0 MACs = 0% MACs, 235.32 us = 0.1% latency, 0 FLOPS)
+        (q_proj): Linear(16.78 M = 0.18% Params, 68.72 GMACs = 0.28% MACs, 348.09 us = 0.14% latency, 394.84 TFLOPS, in_features=4096, out_features=4096, bias=False)
+        (k_proj): Linear(4.19 M = 0.05% Params, 17.18 GMACs = 0.07% MACs, 196.93 us = 0.08% latency, 174.47 TFLOPS, in_features=4096, out_features=1024, bias=False)
+        (v_proj): Linear(4.19 M = 0.05% Params, 17.18 GMACs = 0.07% MACs, 194.55 us = 0.08% latency, 176.61 TFLOPS, in_features=4096, out_features=1024, bias=False)
+        (text_k_proj): Linear(4.19 M = 0.05% Params, 17.18 GMACs = 0.07% MACs, 156.4 us = 0.06% latency, 219.69 TFLOPS, in_features=4096, out_features=1024, bias=False)
+        (text_v_proj): Linear(4.19 M = 0.05% Params, 17.18 GMACs = 0.07% MACs, 155.93 us = 0.06% latency, 220.36 TFLOPS, in_features=4096, out_features=1024, bias=False)
+        (o_proj): Linear(16.78 M = 0.18% Params, 68.72 GMACs = 0.28% MACs, 342.61 us = 0.14% latency, 401.16 TFLOPS, in_features=4096, out_features=4096, bias=False)
+      )
+      (post_attention_layernorm): GemmaRMSNorm(4.1 K = 0% Params, 0 MACs = 0% MACs, 458.24 us = 0.19% latency, 0 FLOPS)
+      (mlp): GemmaMLP(
+        134.38 M = 1.46% Params, 550.43 GMACs = 2.21% MACs, 2.15 ms = 0.87% latency, 513 TFLOPS
+        (gate_proj): Linear(44.79 M = 0.49% Params, 183.48 GMACs = 0.74% MACs, 629.43 us = 0.26% latency, 582.99 TFLOPS, in_features=4096, out_features=10936, bias=False)
+        (up_proj): Linear(44.79 M = 0.49% Params, 183.48 GMACs = 0.74% MACs, 625.37 us = 0.25% latency, 586.77 TFLOPS, in_features=4096, out_features=10936, bias=False)
+        (down_proj): Linear(44.79 M = 0.49% Params, 183.48 GMACs = 0.74% MACs, 585.08 us = 0.24% latency, 627.18 TFLOPS, in_features=10936, out_features=4096, bias=False)
+        (act_fn): PytorchGELUTanh(0 = 0% Params, 0 MACs = 0% MACs, 101.09 us = 0.04% latency, 443.11 GFLOPS)
+      )
+    )
+    (18): DiTLayer(
+      285.41 M = 3.1% Params, 775.38 GMACs = 3.11% MACs, 7.2 ms = 2.93% latency, 215.25 TFLOPS
+      (input_layernorm): AdaLayerNormZero(
+        100.69 M = 1.09% Params, 1.61 GMACs = 0.01% MACs, 938.89 us = 0.38% latency, 3.43 TFLOPS
+        (silu): SiLU(0 = 0% Params, 0 MACs = 0% MACs, 37.19 us = 0.02% latency, 1.76 GFLOPS)
+        (linear): Linear(100.69 M = 1.09% Params, 1.61 GMACs = 0.01% MACs, 234.6 us = 0.1% latency, 13.73 TFLOPS, in_features=4096, out_features=24576, bias=True)
+        (norm): GemmaRMSNorm(4.1 K = 0% Params, 0 MACs = 0% MACs, 461.1 us = 0.19% latency, 0 FLOPS)
+      )
+      (self_attn): DiTSelfAttention(
+        50.33 M = 0.55% Params, 223.34 GMACs = 0.9% MACs, 3.01 ms = 1.23% latency, 148.16 TFLOPS
+        (q_norm): GemmaRMSNorm(128 = 0% Params, 0 MACs = 0% MACs, 465.15 us = 0.19% latency, 0 FLOPS)
+        (k_norm): GemmaRMSNorm(128 = 0% Params, 0 MACs = 0% MACs, 236.27 us = 0.1% latency, 0 FLOPS)
+        (q_proj): Linear(16.78 M = 0.18% Params, 68.72 GMACs = 0.28% MACs, 350.71 us = 0.14% latency, 391.88 TFLOPS, in_features=4096, out_features=4096, bias=False)
+        (k_proj): Linear(4.19 M = 0.05% Params, 17.18 GMACs = 0.07% MACs, 197.17 us = 0.08% latency, 174.26 TFLOPS, in_features=4096, out_features=1024, bias=False)
+        (v_proj): Linear(4.19 M = 0.05% Params, 17.18 GMACs = 0.07% MACs, 195.03 us = 0.08% latency, 176.18 TFLOPS, in_features=4096, out_features=1024, bias=False)
+        (text_k_proj): Linear(4.19 M = 0.05% Params, 17.18 GMACs = 0.07% MACs, 155.93 us = 0.06% latency, 220.36 TFLOPS, in_features=4096, out_features=1024, bias=False)
+        (text_v_proj): Linear(4.19 M = 0.05% Params, 17.18 GMACs = 0.07% MACs, 154.26 us = 0.06% latency, 222.74 TFLOPS, in_features=4096, out_features=1024, bias=False)
+        (o_proj): Linear(16.78 M = 0.18% Params, 68.72 GMACs = 0.28% MACs, 334.02 us = 0.14% latency, 411.46 TFLOPS, in_features=4096, out_features=4096, bias=False)
+      )
+      (post_attention_layernorm): GemmaRMSNorm(4.1 K = 0% Params, 0 MACs = 0% MACs, 460.15 us = 0.19% latency, 0 FLOPS)
+      (mlp): GemmaMLP(
+        134.38 M = 1.46% Params, 550.43 GMACs = 2.21% MACs, 2.14 ms = 0.87% latency, 514.71 TFLOPS
+        (gate_proj): Linear(44.79 M = 0.49% Params, 183.48 GMACs = 0.74% MACs, 627.28 us = 0.25% latency, 584.99 TFLOPS, in_features=4096, out_features=10936, bias=False)
+        (up_proj): Linear(44.79 M = 0.49% Params, 183.48 GMACs = 0.74% MACs, 623.94 us = 0.25% latency, 588.12 TFLOPS, in_features=4096, out_features=10936, bias=False)
+        (down_proj): Linear(44.79 M = 0.49% Params, 183.48 GMACs = 0.74% MACs, 584.6 us = 0.24% latency, 627.69 TFLOPS, in_features=10936, out_features=4096, bias=False)
+        (act_fn): PytorchGELUTanh(0 = 0% Params, 0 MACs = 0% MACs, 101.09 us = 0.04% latency, 443.11 GFLOPS)
+      )
+    )
+    (19): DiTLayer(
+      285.41 M = 3.1% Params, 775.38 GMACs = 3.11% MACs, 7.41 ms = 3.01% latency, 209.34 TFLOPS
+      (input_layernorm): AdaLayerNormZero(
+        100.69 M = 1.09% Params, 1.61 GMACs = 0.01% MACs, 945.33 us = 0.38% latency, 3.41 TFLOPS
+        (silu): SiLU(0 = 0% Params, 0 MACs = 0% MACs, 37.67 us = 0.02% latency, 1.74 GFLOPS)
+        (linear): Linear(100.69 M = 1.09% Params, 1.61 GMACs = 0.01% MACs, 237.7 us = 0.1% latency, 13.55 TFLOPS, in_features=4096, out_features=24576, bias=True)
+        (norm): GemmaRMSNorm(4.1 K = 0% Params, 0 MACs = 0% MACs, 462.53 us = 0.19% latency, 0 FLOPS)
+      )
+      (self_attn): DiTSelfAttention(
+        50.33 M = 0.55% Params, 223.34 GMACs = 0.9% MACs, 3.01 ms = 1.23% latency, 148.17 TFLOPS
+        (q_norm): GemmaRMSNorm(128 = 0% Params, 0 MACs = 0% MACs, 464.44 us = 0.19% latency, 0 FLOPS)
+        (k_norm): GemmaRMSNorm(128 = 0% Params, 0 MACs = 0% MACs, 233.89 us = 0.1% latency, 0 FLOPS)
+        (q_proj): Linear(16.78 M = 0.18% Params, 68.72 GMACs = 0.28% MACs, 350 us = 0.14% latency, 392.68 TFLOPS, in_features=4096, out_features=4096, bias=False)
+        (k_proj): Linear(4.19 M = 0.05% Params, 17.18 GMACs = 0.07% MACs, 198.6 us = 0.08% latency, 173.01 TFLOPS, in_features=4096, out_features=1024, bias=False)
+        (v_proj): Linear(4.19 M = 0.05% Params, 17.18 GMACs = 0.07% MACs, 196.22 us = 0.08% latency, 175.11 TFLOPS, in_features=4096, out_features=1024, bias=False)
+        (text_k_proj): Linear(4.19 M = 0.05% Params, 17.18 GMACs = 0.07% MACs, 158.07 us = 0.06% latency, 217.37 TFLOPS, in_features=4096, out_features=1024, bias=False)
+        (text_v_proj): Linear(4.19 M = 0.05% Params, 17.18 GMACs = 0.07% MACs, 154.97 us = 0.06% latency, 221.72 TFLOPS, in_features=4096, out_features=1024, bias=False)
+        (o_proj): Linear(16.78 M = 0.18% Params, 68.72 GMACs = 0.28% MACs, 333.07 us = 0.14% latency, 412.64 TFLOPS, in_features=4096, out_features=4096, bias=False)
+      )
+      (post_attention_layernorm): GemmaRMSNorm(4.1 K = 0% Params, 0 MACs = 0% MACs, 458.24 us = 0.19% latency, 0 FLOPS)
+      (mlp): GemmaMLP(
+        134.38 M = 1.46% Params, 550.43 GMACs = 2.21% MACs, 2.33 ms = 0.95% latency, 471.85 TFLOPS
+        (gate_proj): Linear(44.79 M = 0.49% Params, 183.48 GMACs = 0.74% MACs, 627.76 us = 0.26% latency, 584.54 TFLOPS, in_features=4096, out_features=10936, bias=False)
+        (up_proj): Linear(44.79 M = 0.49% Params, 183.48 GMACs = 0.74% MACs, 632.76 us = 0.26% latency, 579.92 TFLOPS, in_features=4096, out_features=10936, bias=False)
+        (down_proj): Linear(44.79 M = 0.49% Params, 183.48 GMACs = 0.74% MACs, 596.28 us = 0.24% latency, 615.4 TFLOPS, in_features=10936, out_features=4096, bias=False)
+        (act_fn): PytorchGELUTanh(0 = 0% Params, 0 MACs = 0% MACs, 235.08 us = 0.1% latency, 190.55 GFLOPS)
+      )
+    )
+    (20): DiTLayer(
+      285.41 M = 3.1% Params, 775.38 GMACs = 3.11% MACs, 7.26 ms = 2.95% latency, 213.63 TFLOPS
+      (input_layernorm): AdaLayerNormZero(
+        100.69 M = 1.09% Params, 1.61 GMACs = 0.01% MACs, 948.19 us = 0.39% latency, 3.4 TFLOPS
+        (silu): SiLU(0 = 0% Params, 0 MACs = 0% MACs, 37.19 us = 0.02% latency, 1.76 GFLOPS)
+        (linear): Linear(100.69 M = 1.09% Params, 1.61 GMACs = 0.01% MACs, 234.84 us = 0.1% latency, 13.72 TFLOPS, in_features=4096, out_features=24576, bias=True)
+        (norm): GemmaRMSNorm(4.1 K = 0% Params, 0 MACs = 0% MACs, 463.96 us = 0.19% latency, 0 FLOPS)
+      )
+      (self_attn): DiTSelfAttention(
+        50.33 M = 0.55% Params, 223.34 GMACs = 0.9% MACs, 3.05 ms = 1.24% latency, 146.58 TFLOPS
+        (q_norm): GemmaRMSNorm(128 = 0% Params, 0 MACs = 0% MACs, 468.02 us = 0.19% latency, 0 FLOPS)
+        (k_norm): GemmaRMSNorm(128 = 0% Params, 0 MACs = 0% MACs, 240.33 us = 0.1% latency, 0 FLOPS)
+        (q_proj): Linear(16.78 M = 0.18% Params, 68.72 GMACs = 0.28% MACs, 349.76 us = 0.14% latency, 392.95 TFLOPS, in_features=4096, out_features=4096, bias=False)
+        (k_proj): Linear(4.19 M = 0.05% Params, 17.18 GMACs = 0.07% MACs, 198.36 us = 0.08% latency, 173.22 TFLOPS, in_features=4096, out_features=1024, bias=False)
+        (v_proj): Linear(4.19 M = 0.05% Params, 17.18 GMACs = 0.07% MACs, 196.93 us = 0.08% latency, 174.47 TFLOPS, in_features=4096, out_features=1024, bias=False)
+        (text_k_proj): Linear(4.19 M = 0.05% Params, 17.18 GMACs = 0.07% MACs, 158.31 us = 0.06% latency, 217.04 TFLOPS, in_features=4096, out_features=1024, bias=False)
+        (text_v_proj): Linear(4.19 M = 0.05% Params, 17.18 GMACs = 0.07% MACs, 154.97 us = 0.06% latency, 221.72 TFLOPS, in_features=4096, out_features=1024, bias=False)
+        (o_proj): Linear(16.78 M = 0.18% Params, 68.72 GMACs = 0.28% MACs, 344.04 us = 0.14% latency, 399.49 TFLOPS, in_features=4096, out_features=4096, bias=False)
+      )
+      (post_attention_layernorm): GemmaRMSNorm(4.1 K = 0% Params, 0 MACs = 0% MACs, 458.48 us = 0.19% latency, 0 FLOPS)
+      (mlp): GemmaMLP(
+        134.38 M = 1.46% Params, 550.43 GMACs = 2.21% MACs, 2.15 ms = 0.87% latency, 512.71 TFLOPS
+        (gate_proj): Linear(44.79 M = 0.49% Params, 183.48 GMACs = 0.74% MACs, 628.47 us = 0.26% latency, 583.88 TFLOPS, in_features=4096, out_features=10936, bias=False)
+        (up_proj): Linear(44.79 M = 0.49% Params, 183.48 GMACs = 0.74% MACs, 624.42 us = 0.25% latency, 587.67 TFLOPS, in_features=4096, out_features=10936, bias=False)
+        (down_proj): Linear(44.79 M = 0.49% Params, 183.48 GMACs = 0.74% MACs, 589.61 us = 0.24% latency, 622.36 TFLOPS, in_features=10936, out_features=4096, bias=False)
+        (act_fn): PytorchGELUTanh(0 = 0% Params, 0 MACs = 0% MACs, 101.57 us = 0.04% latency, 441.03 GFLOPS)
+      )
+    )
+    (21): DiTLayer(
+      285.41 M = 3.1% Params, 775.38 GMACs = 3.11% MACs, 7.23 ms = 2.94% latency, 214.63 TFLOPS
+      (input_layernorm): AdaLayerNormZero(
+        100.69 M = 1.09% Params, 1.61 GMACs = 0.01% MACs, 953.91 us = 0.39% latency, 3.38 TFLOPS
+        (silu): SiLU(0 = 0% Params, 0 MACs = 0% MACs, 41.48 us = 0.02% latency, 1.58 GFLOPS)
+        (linear): Linear(100.69 M = 1.09% Params, 1.61 GMACs = 0.01% MACs, 238.42 us = 0.1% latency, 13.51 TFLOPS, in_features=4096, out_features=24576, bias=True)
+        (norm): GemmaRMSNorm(4.1 K = 0% Params, 0 MACs = 0% MACs, 465.87 us = 0.19% latency, 0 FLOPS)
+      )
+      (self_attn): DiTSelfAttention(
+        50.33 M = 0.55% Params, 223.34 GMACs = 0.9% MACs, 3.02 ms = 1.23% latency, 147.75 TFLOPS
+        (q_norm): GemmaRMSNorm(128 = 0% Params, 0 MACs = 0% MACs, 465.87 us = 0.19% latency, 0 FLOPS)
+        (k_norm): GemmaRMSNorm(128 = 0% Params, 0 MACs = 0% MACs, 236.51 us = 0.1% latency, 0 FLOPS)
+        (q_proj): Linear(16.78 M = 0.18% Params, 68.72 GMACs = 0.28% MACs, 349.52 us = 0.14% latency, 393.22 TFLOPS, in_features=4096, out_features=4096, bias=False)
+        (k_proj): Linear(4.19 M = 0.05% Params, 17.18 GMACs = 0.07% MACs, 197.65 us = 0.08% latency, 173.84 TFLOPS, in_features=4096, out_features=1024, bias=False)
+        (v_proj): Linear(4.19 M = 0.05% Params, 17.18 GMACs = 0.07% MACs, 194.55 us = 0.08% latency, 176.61 TFLOPS, in_features=4096, out_features=1024, bias=False)
+        (text_k_proj): Linear(4.19 M = 0.05% Params, 17.18 GMACs = 0.07% MACs, 158.07 us = 0.06% latency, 217.37 TFLOPS, in_features=4096, out_features=1024, bias=False)
+        (text_v_proj): Linear(4.19 M = 0.05% Params, 17.18 GMACs = 0.07% MACs, 155.21 us = 0.06% latency, 221.38 TFLOPS, in_features=4096, out_features=1024, bias=False)
+        (o_proj): Linear(16.78 M = 0.18% Params, 68.72 GMACs = 0.28% MACs, 338.55 us = 0.14% latency, 405.96 TFLOPS, in_features=4096, out_features=4096, bias=False)
+      )
+      (post_attention_layernorm): GemmaRMSNorm(4.1 K = 0% Params, 0 MACs = 0% MACs, 457.53 us = 0.19% latency, 0 FLOPS)
+      (mlp): GemmaMLP(
+        134.38 M = 1.46% Params, 550.43 GMACs = 2.21% MACs, 2.14 ms = 0.87% latency, 514.54 TFLOPS
+        (gate_proj): Linear(44.79 M = 0.49% Params, 183.48 GMACs = 0.74% MACs, 626.8 us = 0.25% latency, 585.43 TFLOPS, in_features=4096, out_features=10936, bias=False)
+        (up_proj): Linear(44.79 M = 0.49% Params, 183.48 GMACs = 0.74% MACs, 623.46 us = 0.25% latency, 588.57 TFLOPS, in_features=4096, out_features=10936, bias=False)
+        (down_proj): Linear(44.79 M = 0.49% Params, 183.48 GMACs = 0.74% MACs, 585.56 us = 0.24% latency, 626.67 TFLOPS, in_features=10936, out_features=4096, bias=False)
+        (act_fn): PytorchGELUTanh(0 = 0% Params, 0 MACs = 0% MACs, 100.85 us = 0.04% latency, 444.16 GFLOPS)
+      )
+    )
+    (22): DiTLayer(
+      285.41 M = 3.1% Params, 775.38 GMACs = 3.11% MACs, 7.2 ms = 2.93% latency, 215.32 TFLOPS
+      (input_layernorm): AdaLayerNormZero(
+        100.69 M = 1.09% Params, 1.61 GMACs = 0.01% MACs, 940.08 us = 0.38% latency, 3.43 TFLOPS
+        (silu): SiLU(0 = 0% Params, 0 MACs = 0% MACs, 36.95 us = 0.02% latency, 1.77 GFLOPS)
+        (linear): Linear(100.69 M = 1.09% Params, 1.61 GMACs = 0.01% MACs, 237.46 us = 0.1% latency, 13.57 TFLOPS, in_features=4096, out_features=24576, bias=True)
+        (norm): GemmaRMSNorm(4.1 K = 0% Params, 0 MACs = 0% MACs, 459.19 us = 0.19% latency, 0 FLOPS)
+      )
+      (self_attn): DiTSelfAttention(
+        50.33 M = 0.55% Params, 223.34 GMACs = 0.9% MACs, 3.02 ms = 1.23% latency, 148.01 TFLOPS
+        (q_norm): GemmaRMSNorm(128 = 0% Params, 0 MACs = 0% MACs, 463.01 us = 0.19% latency, 0 FLOPS)
+        (k_norm): GemmaRMSNorm(128 = 0% Params, 0 MACs = 0% MACs, 236.03 us = 0.1% latency, 0 FLOPS)
+        (q_proj): Linear(16.78 M = 0.18% Params, 68.72 GMACs = 0.28% MACs, 348.57 us = 0.14% latency, 394.3 TFLOPS, in_features=4096, out_features=4096, bias=False)
+        (k_proj): Linear(4.19 M = 0.05% Params, 17.18 GMACs = 0.07% MACs, 198.13 us = 0.08% latency, 173.42 TFLOPS, in_features=4096, out_features=1024, bias=False)
+        (v_proj): Linear(4.19 M = 0.05% Params, 17.18 GMACs = 0.07% MACs, 194.55 us = 0.08% latency, 176.61 TFLOPS, in_features=4096, out_features=1024, bias=False)
+        (text_k_proj): Linear(4.19 M = 0.05% Params, 17.18 GMACs = 0.07% MACs, 158.31 us = 0.06% latency, 217.04 TFLOPS, in_features=4096, out_features=1024, bias=False)
+        (text_v_proj): Linear(4.19 M = 0.05% Params, 17.18 GMACs = 0.07% MACs, 152.35 us = 0.06% latency, 225.53 TFLOPS, in_features=4096, out_features=1024, bias=False)
+        (o_proj): Linear(16.78 M = 0.18% Params, 68.72 GMACs = 0.28% MACs, 336.89 us = 0.14% latency, 407.97 TFLOPS, in_features=4096, out_features=4096, bias=False)
+      )
+      (post_attention_layernorm): GemmaRMSNorm(4.1 K = 0% Params, 0 MACs = 0% MACs, 457.05 us = 0.19% latency, 0 FLOPS)
+      (mlp): GemmaMLP(
+        134.38 M = 1.46% Params, 550.43 GMACs = 2.21% MACs, 2.14 ms = 0.87% latency, 515 TFLOPS
+        (gate_proj): Linear(44.79 M = 0.49% Params, 183.48 GMACs = 0.74% MACs, 627.28 us = 0.25% latency, 584.99 TFLOPS, in_features=4096, out_features=10936, bias=False)
+        (up_proj): Linear(44.79 M = 0.49% Params, 183.48 GMACs = 0.74% MACs, 624.66 us = 0.25% latency, 587.44 TFLOPS, in_features=4096, out_features=10936, bias=False)
+        (down_proj): Linear(44.79 M = 0.49% Params, 183.48 GMACs = 0.74% MACs, 584.6 us = 0.24% latency, 627.69 TFLOPS, in_features=10936, out_features=4096, bias=False)
+        (act_fn): PytorchGELUTanh(0 = 0% Params, 0 MACs = 0% MACs, 99.9 us = 0.04% latency, 448.4 GFLOPS)
+      )
+    )
+    (23): DiTLayer(
+      285.41 M = 3.1% Params, 775.38 GMACs = 3.11% MACs, 7.2 ms = 2.93% latency, 215.41 TFLOPS
+      (input_layernorm): AdaLayerNormZero(
+        100.69 M = 1.09% Params, 1.61 GMACs = 0.01% MACs, 944.85 us = 0.38% latency, 3.41 TFLOPS
+        (silu): SiLU(0 = 0% Params, 0 MACs = 0% MACs, 36.95 us = 0.02% latency, 1.77 GFLOPS)
+        (linear): Linear(100.69 M = 1.09% Params, 1.61 GMACs = 0.01% MACs, 237.94 us = 0.1% latency, 13.54 TFLOPS, in_features=4096, out_features=24576, bias=True)
+        (norm): GemmaRMSNorm(4.1 K = 0% Params, 0 MACs = 0% MACs, 462.29 us = 0.19% latency, 0 FLOPS)
+      )
+      (self_attn): DiTSelfAttention(
+        50.33 M = 0.55% Params, 223.34 GMACs = 0.9% MACs, 3.02 ms = 1.23% latency, 148.04 TFLOPS
+        (q_norm): GemmaRMSNorm(128 = 0% Params, 0 MACs = 0% MACs, 465.39 us = 0.19% latency, 0 FLOPS)
+        (k_norm): GemmaRMSNorm(128 = 0% Params, 0 MACs = 0% MACs, 237.7 us = 0.1% latency, 0 FLOPS)
+        (q_proj): Linear(16.78 M = 0.18% Params, 68.72 GMACs = 0.28% MACs, 349.04 us = 0.14% latency, 393.76 TFLOPS, in_features=4096, out_features=4096, bias=False)
+        (k_proj): Linear(4.19 M = 0.05% Params, 17.18 GMACs = 0.07% MACs, 199.08 us = 0.08% latency, 172.59 TFLOPS, in_features=4096, out_features=1024, bias=False)
+        (v_proj): Linear(4.19 M = 0.05% Params, 17.18 GMACs = 0.07% MACs, 195.03 us = 0.08% latency, 176.18 TFLOPS, in_features=4096, out_features=1024, bias=False)
+        (text_k_proj): Linear(4.19 M = 0.05% Params, 17.18 GMACs = 0.07% MACs, 158.79 us = 0.06% latency, 216.39 TFLOPS, in_features=4096, out_features=1024, bias=False)
+        (text_v_proj): Linear(4.19 M = 0.05% Params, 17.18 GMACs = 0.07% MACs, 156.4 us = 0.06% latency, 219.69 TFLOPS, in_features=4096, out_features=1024, bias=False)
+        (o_proj): Linear(16.78 M = 0.18% Params, 68.72 GMACs = 0.28% MACs, 334.02 us = 0.14% latency, 411.46 TFLOPS, in_features=4096, out_features=4096, bias=False)
+      )
+      (post_attention_layernorm): GemmaRMSNorm(4.1 K = 0% Params, 0 MACs = 0% MACs, 456.81 us = 0.19% latency, 0 FLOPS)
+      (mlp): GemmaMLP(
+        134.38 M = 1.46% Params, 550.43 GMACs = 2.21% MACs, 2.13 ms = 0.87% latency, 516.21 TFLOPS
+        (gate_proj): Linear(44.79 M = 0.49% Params, 183.48 GMACs = 0.74% MACs, 627.28 us = 0.25% latency, 584.99 TFLOPS, in_features=4096, out_features=10936, bias=False)
+        (up_proj): Linear(44.79 M = 0.49% Params, 183.48 GMACs = 0.74% MACs, 623.94 us = 0.25% latency, 588.12 TFLOPS, in_features=4096, out_features=10936, bias=False)
+        (down_proj): Linear(44.79 M = 0.49% Params, 183.48 GMACs = 0.74% MACs, 581.98 us = 0.24% latency, 630.52 TFLOPS, in_features=10936, out_features=4096, bias=False)
+        (act_fn): PytorchGELUTanh(0 = 0% Params, 0 MACs = 0% MACs, 100.61 us = 0.04% latency, 445.21 GFLOPS)
+      )
+    )
+    (24): DiTLayer(
+      285.41 M = 3.1% Params, 775.38 GMACs = 3.11% MACs, 7.23 ms = 2.94% latency, 214.37 TFLOPS
+      (input_layernorm): AdaLayerNormZero(
+        100.69 M = 1.09% Params, 1.61 GMACs = 0.01% MACs, 962.5 us = 0.39% latency, 3.35 TFLOPS
+        (silu): SiLU(0 = 0% Params, 0 MACs = 0% MACs, 39.34 us = 0.02% latency, 1.67 GFLOPS)
+        (linear): Linear(100.69 M = 1.09% Params, 1.61 GMACs = 0.01% MACs, 241.99 us = 0.1% latency, 13.31 TFLOPS, in_features=4096, out_features=24576, bias=True)
+        (norm): GemmaRMSNorm(4.1 K = 0% Params, 0 MACs = 0% MACs, 462.53 us = 0.19% latency, 0 FLOPS)
+      )
+      (self_attn): DiTSelfAttention(
+        50.33 M = 0.55% Params, 223.34 GMACs = 0.9% MACs, 3.02 ms = 1.23% latency, 147.81 TFLOPS
+        (q_norm): GemmaRMSNorm(128 = 0% Params, 0 MACs = 0% MACs, 466.35 us = 0.19% latency, 0 FLOPS)
+        (k_norm): GemmaRMSNorm(128 = 0% Params, 0 MACs = 0% MACs, 239.13 us = 0.1% latency, 0 FLOPS)
+        (q_proj): Linear(16.78 M = 0.18% Params, 68.72 GMACs = 0.28% MACs, 350 us = 0.14% latency, 392.68 TFLOPS, in_features=4096, out_features=4096, bias=False)
+        (k_proj): Linear(4.19 M = 0.05% Params, 17.18 GMACs = 0.07% MACs, 199.08 us = 0.08% latency, 172.59 TFLOPS, in_features=4096, out_features=1024, bias=False)
+        (v_proj): Linear(4.19 M = 0.05% Params, 17.18 GMACs = 0.07% MACs, 195.98 us = 0.08% latency, 175.32 TFLOPS, in_features=4096, out_features=1024, bias=False)
+        (text_k_proj): Linear(4.19 M = 0.05% Params, 17.18 GMACs = 0.07% MACs, 157.12 us = 0.06% latency, 218.69 TFLOPS, in_features=4096, out_features=1024, bias=False)
+        (text_v_proj): Linear(4.19 M = 0.05% Params, 17.18 GMACs = 0.07% MACs, 154.5 us = 0.06% latency, 222.4 TFLOPS, in_features=4096, out_features=1024, bias=False)
+        (o_proj): Linear(16.78 M = 0.18% Params, 68.72 GMACs = 0.28% MACs, 334.02 us = 0.14% latency, 411.46 TFLOPS, in_features=4096, out_features=4096, bias=False)
+      )
+      (post_attention_layernorm): GemmaRMSNorm(4.1 K = 0% Params, 0 MACs = 0% MACs, 457.76 us = 0.19% latency, 0 FLOPS)
+      (mlp): GemmaMLP(
+        134.38 M = 1.46% Params, 550.43 GMACs = 2.21% MACs, 2.14 ms = 0.87% latency, 514.43 TFLOPS
+        (gate_proj): Linear(44.79 M = 0.49% Params, 183.48 GMACs = 0.74% MACs, 627.52 us = 0.26% latency, 584.77 TFLOPS, in_features=4096, out_features=10936, bias=False)
+        (up_proj): Linear(44.79 M = 0.49% Params, 183.48 GMACs = 0.74% MACs, 623.94 us = 0.25% latency, 588.12 TFLOPS, in_features=4096, out_features=10936, bias=False)
+        (down_proj): Linear(44.79 M = 0.49% Params, 183.48 GMACs = 0.74% MACs, 584.6 us = 0.24% latency, 627.69 TFLOPS, in_features=10936, out_features=4096, bias=False)
+        (act_fn): PytorchGELUTanh(0 = 0% Params, 0 MACs = 0% MACs, 101.09 us = 0.04% latency, 443.11 GFLOPS)
+      )
+    )
+    (25): DiTLayer(
+      285.41 M = 3.1% Params, 775.38 GMACs = 3.11% MACs, 7.22 ms = 2.93% latency, 214.88 TFLOPS
+      (input_layernorm): AdaLayerNormZero(
+        100.69 M = 1.09% Params, 1.61 GMACs = 0.01% MACs, 940.32 us = 0.38% latency, 3.43 TFLOPS
+        (silu): SiLU(0 = 0% Params, 0 MACs = 0% MACs, 36.48 us = 0.01% latency, 1.8 GFLOPS)
+        (linear): Linear(100.69 M = 1.09% Params, 1.61 GMACs = 0.01% MACs, 233.17 us = 0.09% latency, 13.81 TFLOPS, in_features=4096, out_features=24576, bias=True)
+        (norm): GemmaRMSNorm(4.1 K = 0% Params, 0 MACs = 0% MACs, 462.06 us = 0.19% latency, 0 FLOPS)
+      )
+      (self_attn): DiTSelfAttention(
+        50.33 M = 0.55% Params, 223.34 GMACs = 0.9% MACs, 3.02 ms = 1.23% latency, 147.96 TFLOPS
+        (q_norm): GemmaRMSNorm(128 = 0% Params, 0 MACs = 0% MACs, 464.44 us = 0.19% latency, 0 FLOPS)
+        (k_norm): GemmaRMSNorm(128 = 0% Params, 0 MACs = 0% MACs, 242.95 us = 0.1% latency, 0 FLOPS)
+        (q_proj): Linear(16.78 M = 0.18% Params, 68.72 GMACs = 0.28% MACs, 349.04 us = 0.14% latency, 393.76 TFLOPS, in_features=4096, out_features=4096, bias=False)
+        (k_proj): Linear(4.19 M = 0.05% Params, 17.18 GMACs = 0.07% MACs, 199.08 us = 0.08% latency, 172.59 TFLOPS, in_features=4096, out_features=1024, bias=False)
+        (v_proj): Linear(4.19 M = 0.05% Params, 17.18 GMACs = 0.07% MACs, 195.98 us = 0.08% latency, 175.32 TFLOPS, in_features=4096, out_features=1024, bias=False)
+        (text_k_proj): Linear(4.19 M = 0.05% Params, 17.18 GMACs = 0.07% MACs, 157.12 us = 0.06% latency, 218.69 TFLOPS, in_features=4096, out_features=1024, bias=False)
+        (text_v_proj): Linear(4.19 M = 0.05% Params, 17.18 GMACs = 0.07% MACs, 153.3 us = 0.06% latency, 224.13 TFLOPS, in_features=4096, out_features=1024, bias=False)
+        (o_proj): Linear(16.78 M = 0.18% Params, 68.72 GMACs = 0.28% MACs, 335.93 us = 0.14% latency, 409.13 TFLOPS, in_features=4096, out_features=4096, bias=False)
+      )
+      (post_attention_layernorm): GemmaRMSNorm(4.1 K = 0% Params, 0 MACs = 0% MACs, 457.53 us = 0.19% latency, 0 FLOPS)
+      (mlp): GemmaMLP(
+        134.38 M = 1.46% Params, 550.43 GMACs = 2.21% MACs, 2.15 ms = 0.87% latency, 512.03 TFLOPS
+        (gate_proj): Linear(44.79 M = 0.49% Params, 183.48 GMACs = 0.74% MACs, 633.48 us = 0.26% latency, 579.26 TFLOPS, in_features=4096, out_features=10936, bias=False)
+        (up_proj): Linear(44.79 M = 0.49% Params, 183.48 GMACs = 0.74% MACs, 626.09 us = 0.25% latency, 586.1 TFLOPS, in_features=4096, out_features=10936, bias=False)
+        (down_proj): Linear(44.79 M = 0.49% Params, 183.48 GMACs = 0.74% MACs, 588.42 us = 0.24% latency, 623.62 TFLOPS, in_features=10936, out_features=4096, bias=False)
+        (act_fn): PytorchGELUTanh(0 = 0% Params, 0 MACs = 0% MACs, 100.37 us = 0.04% latency, 446.27 GFLOPS)
+      )
+    )
+    (26): DiTLayer(
+      285.41 M = 3.1% Params, 775.38 GMACs = 3.11% MACs, 7.31 ms = 2.97% latency, 212.01 TFLOPS
+      (input_layernorm): AdaLayerNormZero(
+        100.69 M = 1.09% Params, 1.61 GMACs = 0.01% MACs, 1.03 ms = 0.42% latency, 3.13 TFLOPS
+        (silu): SiLU(0 = 0% Params, 0 MACs = 0% MACs, 36 us = 0.01% latency, 1.82 GFLOPS)
+        (linear): Linear(100.69 M = 1.09% Params, 1.61 GMACs = 0.01% MACs, 235.8 us = 0.1% latency, 13.66 TFLOPS, in_features=4096, out_features=24576, bias=True)
+        (norm): GemmaRMSNorm(4.1 K = 0% Params, 0 MACs = 0% MACs, 464.68 us = 0.19% latency, 0 FLOPS)
+      )
+      (self_attn): DiTSelfAttention(
+        50.33 M = 0.55% Params, 223.34 GMACs = 0.9% MACs, 3.03 ms = 1.23% latency, 147.59 TFLOPS
+        (q_norm): GemmaRMSNorm(128 = 0% Params, 0 MACs = 0% MACs, 468.02 us = 0.19% latency, 0 FLOPS)
+        (k_norm): GemmaRMSNorm(128 = 0% Params, 0 MACs = 0% MACs, 237.23 us = 0.1% latency, 0 FLOPS)
+        (q_proj): Linear(16.78 M = 0.18% Params, 68.72 GMACs = 0.28% MACs, 360.01 us = 0.15% latency, 381.76 TFLOPS, in_features=4096, out_features=4096, bias=False)
+        (k_proj): Linear(4.19 M = 0.05% Params, 17.18 GMACs = 0.07% MACs, 197.89 us = 0.08% latency, 173.63 TFLOPS, in_features=4096, out_features=1024, bias=False)
+        (v_proj): Linear(4.19 M = 0.05% Params, 17.18 GMACs = 0.07% MACs, 194.55 us = 0.08% latency, 176.61 TFLOPS, in_features=4096, out_features=1024, bias=False)
+        (text_k_proj): Linear(4.19 M = 0.05% Params, 17.18 GMACs = 0.07% MACs, 156.64 us = 0.06% latency, 219.35 TFLOPS, in_features=4096, out_features=1024, bias=False)
+        (text_v_proj): Linear(4.19 M = 0.05% Params, 17.18 GMACs = 0.07% MACs, 153.54 us = 0.06% latency, 223.78 TFLOPS, in_features=4096, out_features=1024, bias=False)
+        (o_proj): Linear(16.78 M = 0.18% Params, 68.72 GMACs = 0.28% MACs, 335.93 us = 0.14% latency, 409.13 TFLOPS, in_features=4096, out_features=4096, bias=False)
+      )
+      (post_attention_layernorm): GemmaRMSNorm(4.1 K = 0% Params, 0 MACs = 0% MACs, 460.15 us = 0.19% latency, 0 FLOPS)
+      (mlp): GemmaMLP(
+        134.38 M = 1.46% Params, 550.43 GMACs = 2.21% MACs, 2.15 ms = 0.87% latency, 512.43 TFLOPS
+        (gate_proj): Linear(44.79 M = 0.49% Params, 183.48 GMACs = 0.74% MACs, 629.43 us = 0.26% latency, 582.99 TFLOPS, in_features=4096, out_features=10936, bias=False)
+        (up_proj): Linear(44.79 M = 0.49% Params, 183.48 GMACs = 0.74% MACs, 626.33 us = 0.25% latency, 585.88 TFLOPS, in_features=4096, out_features=10936, bias=False)
+        (down_proj): Linear(44.79 M = 0.49% Params, 183.48 GMACs = 0.74% MACs, 588.89 us = 0.24% latency, 623.12 TFLOPS, in_features=10936, out_features=4096, bias=False)
+        (act_fn): PytorchGELUTanh(0 = 0% Params, 0 MACs = 0% MACs, 100.85 us = 0.04% latency, 444.16 GFLOPS)
+      )
+    )
+    (27): DiTLayer(
+      285.41 M = 3.1% Params, 775.38 GMACs = 3.11% MACs, 9.77 ms = 3.97% latency, 158.72 TFLOPS
+      (input_layernorm): AdaLayerNormZero(
+        100.69 M = 1.09% Params, 1.61 GMACs = 0.01% MACs, 991.82 us = 0.4% latency, 3.25 TFLOPS
+        (silu): SiLU(0 = 0% Params, 0 MACs = 0% MACs, 83.68 us = 0.03% latency, 783.13 MFLOPS)
+        (linear): Linear(100.69 M = 1.09% Params, 1.61 GMACs = 0.01% MACs, 236.75 us = 0.1% latency, 13.61 TFLOPS, in_features=4096, out_features=24576, bias=True)
+        (norm): GemmaRMSNorm(4.1 K = 0% Params, 0 MACs = 0% MACs, 463.49 us = 0.19% latency, 0 FLOPS)
+      )
+      (self_attn): DiTSelfAttention(
+        50.33 M = 0.55% Params, 223.34 GMACs = 0.9% MACs, 3.01 ms = 1.22% latency, 148.24 TFLOPS
+        (q_norm): GemmaRMSNorm(128 = 0% Params, 0 MACs = 0% MACs, 464.2 us = 0.19% latency, 0 FLOPS)
+        (k_norm): GemmaRMSNorm(128 = 0% Params, 0 MACs = 0% MACs, 238.42 us = 0.1% latency, 0 FLOPS)
+        (q_proj): Linear(16.78 M = 0.18% Params, 68.72 GMACs = 0.28% MACs, 348.33 us = 0.14% latency, 394.57 TFLOPS, in_features=4096, out_features=4096, bias=False)
+        (k_proj): Linear(4.19 M = 0.05% Params, 17.18 GMACs = 0.07% MACs, 197.89 us = 0.08% latency, 173.63 TFLOPS, in_features=4096, out_features=1024, bias=False)
+        (v_proj): Linear(4.19 M = 0.05% Params, 17.18 GMACs = 0.07% MACs, 195.26 us = 0.08% latency, 175.96 TFLOPS, in_features=4096, out_features=1024, bias=False)
+        (text_k_proj): Linear(4.19 M = 0.05% Params, 17.18 GMACs = 0.07% MACs, 156.64 us = 0.06% latency, 219.35 TFLOPS, in_features=4096, out_features=1024, bias=False)
+        (text_v_proj): Linear(4.19 M = 0.05% Params, 17.18 GMACs = 0.07% MACs, 154.26 us = 0.06% latency, 222.74 TFLOPS, in_features=4096, out_features=1024, bias=False)
+        (o_proj): Linear(16.78 M = 0.18% Params, 68.72 GMACs = 0.28% MACs, 336.41 us = 0.14% latency, 408.55 TFLOPS, in_features=4096, out_features=4096, bias=False)
+      )
+      (post_attention_layernorm): GemmaRMSNorm(4.1 K = 0% Params, 0 MACs = 0% MACs, 458.72 us = 0.19% latency, 0 FLOPS)
+      (mlp): GemmaMLP(
+        134.38 M = 1.46% Params, 550.43 GMACs = 2.21% MACs, 4.59 ms = 1.87% latency, 239.65 TFLOPS
+        (gate_proj): Linear(44.79 M = 0.49% Params, 183.48 GMACs = 0.74% MACs, 629.66 us = 0.26% latency, 582.77 TFLOPS, in_features=4096, out_features=10936, bias=False)
+        (up_proj): Linear(44.79 M = 0.49% Params, 183.48 GMACs = 0.74% MACs, 884.77 us = 0.36% latency, 414.74 TFLOPS, in_features=4096, out_features=10936, bias=False)
+        (down_proj): Linear(44.79 M = 0.49% Params, 183.48 GMACs = 0.74% MACs, 599.62 us = 0.24% latency, 611.97 TFLOPS, in_features=10936, out_features=4096, bias=False)
+        (act_fn): PytorchGELUTanh(0 = 0% Params, 0 MACs = 0% MACs, 1.97 ms = 0.8% latency, 22.75 GFLOPS)
+      )
+    )
+    (28): DiTLayer(
+      285.41 M = 3.1% Params, 775.38 GMACs = 3.11% MACs, 7.41 ms = 3.01% latency, 209.21 TFLOPS
+      (input_layernorm): AdaLayerNormZero(
+        100.69 M = 1.09% Params, 1.61 GMACs = 0.01% MACs, 982.52 us = 0.4% latency, 3.28 TFLOPS
+        (silu): SiLU(0 = 0% Params, 0 MACs = 0% MACs, 41.01 us = 0.02% latency, 1.6 GFLOPS)
+        (linear): Linear(100.69 M = 1.09% Params, 1.61 GMACs = 0.01% MACs, 244.14 us = 0.1% latency, 13.19 TFLOPS, in_features=4096, out_features=24576, bias=True)
+        (norm): GemmaRMSNorm(4.1 K = 0% Params, 0 MACs = 0% MACs, 469.21 us = 0.19% latency, 0 FLOPS)
+      )
+      (self_attn): DiTSelfAttention(
+        50.33 M = 0.55% Params, 223.34 GMACs = 0.9% MACs, 3.09 ms = 1.25% latency, 144.71 TFLOPS
+        (q_norm): GemmaRMSNorm(128 = 0% Params, 0 MACs = 0% MACs, 464.92 us = 0.19% latency, 0 FLOPS)
+        (k_norm): GemmaRMSNorm(128 = 0% Params, 0 MACs = 0% MACs, 260.35 us = 0.11% latency, 0 FLOPS)
+        (q_proj): Linear(16.78 M = 0.18% Params, 68.72 GMACs = 0.28% MACs, 366.21 us = 0.15% latency, 375.3 TFLOPS, in_features=4096, out_features=4096, bias=False)
+        (k_proj): Linear(4.19 M = 0.05% Params, 17.18 GMACs = 0.07% MACs, 218.39 us = 0.09% latency, 157.33 TFLOPS, in_features=4096, out_features=1024, bias=False)
+        (v_proj): Linear(4.19 M = 0.05% Params, 17.18 GMACs = 0.07% MACs, 196.7 us = 0.08% latency, 174.69 TFLOPS, in_features=4096, out_features=1024, bias=False)
+        (text_k_proj): Linear(4.19 M = 0.05% Params, 17.18 GMACs = 0.07% MACs, 156.4 us = 0.06% latency, 219.69 TFLOPS, in_features=4096, out_features=1024, bias=False)
+        (text_v_proj): Linear(4.19 M = 0.05% Params, 17.18 GMACs = 0.07% MACs, 153.06 us = 0.06% latency, 224.48 TFLOPS, in_features=4096, out_features=1024, bias=False)
+        (o_proj): Linear(16.78 M = 0.18% Params, 68.72 GMACs = 0.28% MACs, 339.03 us = 0.14% latency, 405.39 TFLOPS, in_features=4096, out_features=4096, bias=False)
+      )
+      (post_attention_layernorm): GemmaRMSNorm(4.1 K = 0% Params, 0 MACs = 0% MACs, 457.29 us = 0.19% latency, 0 FLOPS)
+      (mlp): GemmaMLP(
+        134.38 M = 1.46% Params, 550.43 GMACs = 2.21% MACs, 2.2 ms = 0.89% latency, 501.25 TFLOPS
+        (gate_proj): Linear(44.79 M = 0.49% Params, 183.48 GMACs = 0.74% MACs, 640.87 us = 0.26% latency, 572.58 TFLOPS, in_features=4096, out_features=10936, bias=False)
+        (up_proj): Linear(44.79 M = 0.49% Params, 183.48 GMACs = 0.74% MACs, 626.8 us = 0.25% latency, 585.43 TFLOPS, in_features=4096, out_features=10936, bias=False)
+        (down_proj): Linear(44.79 M = 0.49% Params, 183.48 GMACs = 0.74% MACs, 588.66 us = 0.24% latency, 623.37 TFLOPS, in_features=10936, out_features=4096, bias=False)
+        (act_fn): PytorchGELUTanh(0 = 0% Params, 0 MACs = 0% MACs, 114.92 us = 0.05% latency, 389.79 GFLOPS)
+      )
+    )
+    (29): DiTLayer(
+      285.41 M = 3.1% Params, 775.38 GMACs = 3.11% MACs, 7.29 ms = 2.96% latency, 212.8 TFLOPS
+      (input_layernorm): AdaLayerNormZero(
+        100.69 M = 1.09% Params, 1.61 GMACs = 0.01% MACs, 957.73 us = 0.39% latency, 3.36 TFLOPS
+        (silu): SiLU(0 = 0% Params, 0 MACs = 0% MACs, 40.05 us = 0.02% latency, 1.64 GFLOPS)
+        (linear): Linear(100.69 M = 1.09% Params, 1.61 GMACs = 0.01% MACs, 244.14 us = 0.1% latency, 13.19 TFLOPS, in_features=4096, out_features=24576, bias=True)
+        (norm): GemmaRMSNorm(4.1 K = 0% Params, 0 MACs = 0% MACs, 463.25 us = 0.19% latency, 0 FLOPS)
+      )
+      (self_attn): DiTSelfAttention(
+        50.33 M = 0.55% Params, 223.34 GMACs = 0.9% MACs, 3.05 ms = 1.24% latency, 146.32 TFLOPS
+        (q_norm): GemmaRMSNorm(128 = 0% Params, 0 MACs = 0% MACs, 463.49 us = 0.19% latency, 0 FLOPS)
+        (k_norm): GemmaRMSNorm(128 = 0% Params, 0 MACs = 0% MACs, 236.75 us = 0.1% latency, 0 FLOPS)
+        (q_proj): Linear(16.78 M = 0.18% Params, 68.72 GMACs = 0.28% MACs, 351.19 us = 0.14% latency, 391.35 TFLOPS, in_features=4096, out_features=4096, bias=False)
+        (k_proj): Linear(4.19 M = 0.05% Params, 17.18 GMACs = 0.07% MACs, 199.32 us = 0.08% latency, 172.39 TFLOPS, in_features=4096, out_features=1024, bias=False)
+        (v_proj): Linear(4.19 M = 0.05% Params, 17.18 GMACs = 0.07% MACs, 194.55 us = 0.08% latency, 176.61 TFLOPS, in_features=4096, out_features=1024, bias=False)
+        (text_k_proj): Linear(4.19 M = 0.05% Params, 17.18 GMACs = 0.07% MACs, 156.16 us = 0.06% latency, 220.02 TFLOPS, in_features=4096, out_features=1024, bias=False)
+        (text_v_proj): Linear(4.19 M = 0.05% Params, 17.18 GMACs = 0.07% MACs, 175.95 us = 0.07% latency, 195.28 TFLOPS, in_features=4096, out_features=1024, bias=False)
+        (o_proj): Linear(16.78 M = 0.18% Params, 68.72 GMACs = 0.28% MACs, 355.96 us = 0.14% latency, 386.11 TFLOPS, in_features=4096, out_features=4096, bias=False)
+      )
+      (post_attention_layernorm): GemmaRMSNorm(4.1 K = 0% Params, 0 MACs = 0% MACs, 457.76 us = 0.19% latency, 0 FLOPS)
+      (mlp): GemmaMLP(
+        134.38 M = 1.46% Params, 550.43 GMACs = 2.21% MACs, 2.16 ms = 0.88% latency, 509.32 TFLOPS
+        (gate_proj): Linear(44.79 M = 0.49% Params, 183.48 GMACs = 0.74% MACs, 643.97 us = 0.26% latency, 569.83 TFLOPS, in_features=4096, out_features=10936, bias=False)
+        (up_proj): Linear(44.79 M = 0.49% Params, 183.48 GMACs = 0.74% MACs, 625.37 us = 0.25% latency, 586.77 TFLOPS, in_features=4096, out_features=10936, bias=False)
+        (down_proj): Linear(44.79 M = 0.49% Params, 183.48 GMACs = 0.74% MACs, 589.37 us = 0.24% latency, 622.62 TFLOPS, in_features=10936, out_features=4096, bias=False)
+        (act_fn): PytorchGELUTanh(0 = 0% Params, 0 MACs = 0% MACs, 100.85 us = 0.04% latency, 444.16 GFLOPS)
+      )
+    )
+    (30): DiTLayer(
+      285.41 M = 3.1% Params, 775.38 GMACs = 3.11% MACs, 7.26 ms = 2.95% latency, 213.5 TFLOPS
+      (input_layernorm): AdaLayerNormZero(
+        100.69 M = 1.09% Params, 1.61 GMACs = 0.01% MACs, 937.22 us = 0.38% latency, 3.44 TFLOPS
+        (silu): SiLU(0 = 0% Params, 0 MACs = 0% MACs, 39.58 us = 0.02% latency, 1.66 GFLOPS)
+        (linear): Linear(100.69 M = 1.09% Params, 1.61 GMACs = 0.01% MACs, 231.74 us = 0.09% latency, 13.9 TFLOPS, in_features=4096, out_features=24576, bias=True)
+        (norm): GemmaRMSNorm(4.1 K = 0% Params, 0 MACs = 0% MACs, 459.91 us = 0.19% latency, 0 FLOPS)
+      )
+      (self_attn): DiTSelfAttention(
+        50.33 M = 0.55% Params, 223.34 GMACs = 0.9% MACs, 3.05 ms = 1.24% latency, 146.45 TFLOPS
+        (q_norm): GemmaRMSNorm(128 = 0% Params, 0 MACs = 0% MACs, 464.68 us = 0.19% latency, 0 FLOPS)
+        (k_norm): GemmaRMSNorm(128 = 0% Params, 0 MACs = 0% MACs, 238.18 us = 0.1% latency, 0 FLOPS)
+        (q_proj): Linear(16.78 M = 0.18% Params, 68.72 GMACs = 0.28% MACs, 350 us = 0.14% latency, 392.68 TFLOPS, in_features=4096, out_features=4096, bias=False)
+        (k_proj): Linear(4.19 M = 0.05% Params, 17.18 GMACs = 0.07% MACs, 195.98 us = 0.08% latency, 175.32 TFLOPS, in_features=4096, out_features=1024, bias=False)
+        (v_proj): Linear(4.19 M = 0.05% Params, 17.18 GMACs = 0.07% MACs, 214.82 us = 0.09% latency, 159.95 TFLOPS, in_features=4096, out_features=1024, bias=False)
+        (text_k_proj): Linear(4.19 M = 0.05% Params, 17.18 GMACs = 0.07% MACs, 157.83 us = 0.06% latency, 217.7 TFLOPS, in_features=4096, out_features=1024, bias=False)
+        (text_v_proj): Linear(4.19 M = 0.05% Params, 17.18 GMACs = 0.07% MACs, 168.56 us = 0.07% latency, 203.84 TFLOPS, in_features=4096, out_features=1024, bias=False)
+        (o_proj): Linear(16.78 M = 0.18% Params, 68.72 GMACs = 0.28% MACs, 338.55 us = 0.14% latency, 405.96 TFLOPS, in_features=4096, out_features=4096, bias=False)
+      )
+      (post_attention_layernorm): GemmaRMSNorm(4.1 K = 0% Params, 0 MACs = 0% MACs, 458.24 us = 0.19% latency, 0 FLOPS)
+      (mlp): GemmaMLP(
+        134.38 M = 1.46% Params, 550.43 GMACs = 2.21% MACs, 2.17 ms = 0.88% latency, 508.14 TFLOPS
+        (gate_proj): Linear(44.79 M = 0.49% Params, 183.48 GMACs = 0.74% MACs, 649.45 us = 0.26% latency, 565.02 TFLOPS, in_features=4096, out_features=10936, bias=False)
+        (up_proj): Linear(44.79 M = 0.49% Params, 183.48 GMACs = 0.74% MACs, 626.56 us = 0.25% latency, 585.66 TFLOPS, in_features=4096, out_features=10936, bias=False)
+        (down_proj): Linear(44.79 M = 0.49% Params, 183.48 GMACs = 0.74% MACs, 587.7 us = 0.24% latency, 624.38 TFLOPS, in_features=10936, out_features=4096, bias=False)
+        (act_fn): PytorchGELUTanh(0 = 0% Params, 0 MACs = 0% MACs, 101.57 us = 0.04% latency, 441.03 GFLOPS)
+      )
+    )
+    (31): DiTLayer(
+      285.41 M = 3.1% Params, 775.38 GMACs = 3.11% MACs, 7.25 ms = 2.95% latency, 213.94 TFLOPS
+      (input_layernorm): AdaLayerNormZero(
+        100.69 M = 1.09% Params, 1.61 GMACs = 0.01% MACs, 940.56 us = 0.38% latency, 3.42 TFLOPS
+        (silu): SiLU(0 = 0% Params, 0 MACs = 0% MACs, 37.19 us = 0.02% latency, 1.76 GFLOPS)
+        (linear): Linear(100.69 M = 1.09% Params, 1.61 GMACs = 0.01% MACs, 235.08 us = 0.1% latency, 13.7 TFLOPS, in_features=4096, out_features=24576, bias=True)
+        (norm): GemmaRMSNorm(4.1 K = 0% Params, 0 MACs = 0% MACs, 462.29 us = 0.19% latency, 0 FLOPS)
+      )
+      (self_attn): DiTSelfAttention(
+        50.33 M = 0.55% Params, 223.34 GMACs = 0.9% MACs, 3.03 ms = 1.23% latency, 147.55 TFLOPS
+        (q_norm): GemmaRMSNorm(128 = 0% Params, 0 MACs = 0% MACs, 466.11 us = 0.19% latency, 0 FLOPS)
+        (k_norm): GemmaRMSNorm(128 = 0% Params, 0 MACs = 0% MACs, 235.32 us = 0.1% latency, 0 FLOPS)
+        (q_proj): Linear(16.78 M = 0.18% Params, 68.72 GMACs = 0.28% MACs, 347.38 us = 0.14% latency, 395.65 TFLOPS, in_features=4096, out_features=4096, bias=False)
+        (k_proj): Linear(4.19 M = 0.05% Params, 17.18 GMACs = 0.07% MACs, 216.01 us = 0.09% latency, 159.07 TFLOPS, in_features=4096, out_features=1024, bias=False)
+        (v_proj): Linear(4.19 M = 0.05% Params, 17.18 GMACs = 0.07% MACs, 194.31 us = 0.08% latency, 176.83 TFLOPS, in_features=4096, out_features=1024, bias=False)
+        (text_k_proj): Linear(4.19 M = 0.05% Params, 17.18 GMACs = 0.07% MACs, 156.16 us = 0.06% latency, 220.02 TFLOPS, in_features=4096, out_features=1024, bias=False)
+        (text_v_proj): Linear(4.19 M = 0.05% Params, 17.18 GMACs = 0.07% MACs, 154.5 us = 0.06% latency, 222.4 TFLOPS, in_features=4096, out_features=1024, bias=False)
+        (o_proj): Linear(16.78 M = 0.18% Params, 68.72 GMACs = 0.28% MACs, 340.46 us = 0.14% latency, 403.68 TFLOPS, in_features=4096, out_features=4096, bias=False)
+      )
+      (post_attention_layernorm): GemmaRMSNorm(4.1 K = 0% Params, 0 MACs = 0% MACs, 456.81 us = 0.19% latency, 0 FLOPS)
+      (mlp): GemmaMLP(
+        134.38 M = 1.46% Params, 550.43 GMACs = 2.21% MACs, 2.17 ms = 0.88% latency, 508.42 TFLOPS
+        (gate_proj): Linear(44.79 M = 0.49% Params, 183.48 GMACs = 0.74% MACs, 641.82 us = 0.26% latency, 571.73 TFLOPS, in_features=4096, out_features=10936, bias=False)
+        (up_proj): Linear(44.79 M = 0.49% Params, 183.48 GMACs = 0.74% MACs, 624.9 us = 0.25% latency, 587.22 TFLOPS, in_features=4096, out_features=10936, bias=False)
+        (down_proj): Linear(44.79 M = 0.49% Params, 183.48 GMACs = 0.74% MACs, 581.98 us = 0.24% latency, 630.52 TFLOPS, in_features=10936, out_features=4096, bias=False)
+        (act_fn): PytorchGELUTanh(0 = 0% Params, 0 MACs = 0% MACs, 100.85 us = 0.04% latency, 444.16 GFLOPS)
+      )
+    )
+  )
+  (patch_embed): PatchEmbed(
+    266.24 K = 0% Params, 1.07 GMACs = 0% MACs, 627.04 us = 0.25% latency, 3.45 TFLOPS
+    (proj): Conv2d(266.24 K = 0% Params, 1.07 GMACs = 0% MACs, 391.01 us = 0.16% latency, 5.54 TFLOPS, 16, 4096, kernel_size=(2, 2), stride=(2, 2))
+  )
+  (rotary_emb): GemmaRotaryEmbedding(0 = 0% Params, 0 MACs = 0% MACs, 0 s = 0% latency, 0 FLOPS)
+  (time_proj): Timesteps(0 = 0% Params, 0 MACs = 0% MACs, 261.78 us = 0.11% latency, 0 FLOPS)
+  (timestep_embedder): Sequential(
+    17.83 M = 0.19% Params, 285.21 MMACs = 0% MACs, 520.94 us = 0.21% latency, 1.1 TFLOPS
+    (0): Linear(1.05 M = 0.01% Params, 16.78 MMACs = 0% MACs, 221.73 us = 0.09% latency, 151.33 GFLOPS, in_features=256, out_features=4096, bias=True)
+    (1): SiLU(0 = 0% Params, 0 MACs = 0% MACs, 41.96 us = 0.02% latency, 1.56 GFLOPS)
+    (2): Linear(16.78 M = 0.18% Params, 268.44 MMACs = 0% MACs, 184.54 us = 0.07% latency, 2.91 TFLOPS, in_features=4096, out_features=4096, bias=True)
+  )
+  (context_embedder): Sequential(
+    8.39 M = 0.09% Params, 34.36 GMACs = 0.14% MACs, 499.01 us = 0.2% latency, 137.71 TFLOPS
+    (0): GemmaRMSNorm(2.05 K = 0% Params, 0 MACs = 0% MACs, 178.81 us = 0.07% latency, 0 FLOPS)
+    (1): Linear(8.39 M = 0.09% Params, 34.36 GMACs = 0.14% MACs, 267.27 us = 0.11% latency, 257.12 TFLOPS, in_features=2048, out_features=4096, bias=True)
+  )
+  (norm_out): AdaLayerNormOut(
+    33.57 M = 0.36% Params, 536.87 MMACs = 0% MACs, 921.01 us = 0.37% latency, 1.17 TFLOPS
+    (silu): SiLU(0 = 0% Params, 0 MACs = 0% MACs, 51.5 us = 0.02% latency, 1.27 GFLOPS)
+    (linear): Linear(33.56 M = 0.36% Params, 536.87 MMACs = 0% MACs, 197.89 us = 0.08% latency, 5.43 TFLOPS, in_features=4096, out_features=8192, bias=True)
+    (norm): GemmaRMSNorm(4.1 K = 0% Params, 0 MACs = 0% MACs, 461.1 us = 0.19% latency, 0 FLOPS)
+  )
+  (proj_out): Linear(262.21 K = 0% Params, 1.07 GMACs = 0% MACs, 205.99 us = 0.08% latency, 10.42 TFLOPS, in_features=4096, out_features=64, bias=True)
+  (repa_projector): Sequential(
+    14.16 M = 0.15% Params, 57.98 GMACs = 0.23% MACs, 774.15 us = 0.31% latency, 149.82 TFLOPS
+    (0): Linear(8.39 M = 0.09% Params, 34.36 GMACs = 0.14% MACs, 276.33 us = 0.11% latency, 248.69 TFLOPS, in_features=4096, out_features=2048, bias=True)
+    (1): SiLU(0 = 0% Params, 0 MACs = 0% MACs, 33.62 us = 0.01% latency, 249.53 GFLOPS)
+    (2): Linear(4.2 M = 0.05% Params, 17.18 GMACs = 0.07% MACs, 185.73 us = 0.08% latency, 185 TFLOPS, in_features=2048, out_features=2048, bias=True)
+    (3): SiLU(0 = 0% Params, 0 MACs = 0% MACs, 30.04 us = 0.01% latency, 279.24 GFLOPS)
+    (4): Linear(1.57 M = 0.02% Params, 6.44 GMACs = 0.03% MACs, 144.48 us = 0.06% latency, 89.18 TFLOPS, in_features=2048, out_features=768, bias=True)
+  )
+)
+------------------------------------------------------------------------------

latest ADDED Viewed

	@@ -0,0 +1 @@


1	+ 100000

zero_to_fp32.py ADDED Viewed

	@@ -0,0 +1,604 @@

+#!/usr/bin/env python
+# Copyright (c) Microsoft Corporation.
+# SPDX-License-Identifier: Apache-2.0
+# DeepSpeed Team
+# This script extracts fp32 consolidated weights from a zero 1, 2 and 3 DeepSpeed checkpoints. It gets
+# copied into the top level checkpoint dir, so the user can easily do the conversion at any point in
+# the future. Once extracted, the weights don't require DeepSpeed and can be used in any
+# application.
+#
+# example: python zero_to_fp32.py . pytorch_model.bin
+import argparse
+import torch
+import glob
+import math
+import os
+import re
+from collections import OrderedDict
+from dataclasses import dataclass
+# while this script doesn't use deepspeed to recover data, since the checkpoints are pickled with
+# DeepSpeed data structures it has to be available in the current python environment.
+from deepspeed.utils import logger
+from deepspeed.checkpoint.constants import (DS_VERSION, OPTIMIZER_STATE_DICT, SINGLE_PARTITION_OF_FP32_GROUPS,
+                                            FP32_FLAT_GROUPS, ZERO_STAGE, PARTITION_COUNT, PARAM_SHAPES, BUFFER_NAMES,
+                                            FROZEN_PARAM_SHAPES, FROZEN_PARAM_FRAGMENTS)
@dataclass
class zero_model_state:
    """Per-file bundle of the model-state entries used to rebuild fp32 weights.

    One instance is created for every ``*_model_states.pt`` file that gets parsed.
    The annotations are real types (not ``dict()`` instances) so that readers and
    static tooling see meaningful field types.
    """
    # buffer name -> tensor, already converted with ``.float()`` by the parser
    buffers: dict
    # sequence of ``name -> shape`` mappings; the merge helpers iterate it and
    # call ``.items()`` on every element (presumably one mapping per param group)
    param_shapes: list
    # ``[key, value]`` pairs built from the checkpoint's "shared_params" mapping
    shared_params: list
    # NOTE(review): read with ``state_dict.get(DS_VERSION, None)``, so it may be
    # None, and it is printed as ``deepspeed=={ds_version}`` - confirm that
    # ``int`` is the right annotation
    ds_version: int
    # name -> shape of frozen parameters; None when the checkpoint has no such entry
    frozen_param_shapes: dict
    # name -> stored fragment of each frozen parameter; None when absent
    frozen_param_fragments: dict
# Verbosity switch: any truthy value makes the helpers in this file print
# extra diagnostics while they reconstruct the state dict.
debug = 0

# load to cpu: this device is passed as ``map_location`` to every torch.load call
device = torch.device("cpu")
def atoi(text):
    """Return ``int(text)`` when ``text.isdigit()`` is true, otherwise ``text`` unchanged."""
    if text.isdigit():
        return int(text)
    return text
def natural_keys(text):
    '''
    Sort key for human ("natural") ordering: ``alist.sort(key=natural_keys)``.

    The text is split on runs of digits (the runs are kept), and every piece is
    passed through ``atoi`` so digit runs compare as integers.
    http://nedbatchelder.com/blog/200712/human_sorting.html
    (See Toothy's implementation in the comments)
    '''
    pieces = re.split(r'(\d+)', text)
    return list(map(atoi, pieces))
def get_model_state_file(checkpoint_dir, zero_stage):
    """Return the path of the single model-states file for the given ZeRO stage.

    Args:
        checkpoint_dir: directory expected to contain the checkpoint files.
        zero_stage: ZeRO stage of the checkpoint; stages <= 2 and stage 3 use
            different file names.

    Returns:
        ``checkpoint_dir`` joined with the stage-specific file name.

    Raises:
        FileNotFoundError: if ``checkpoint_dir`` is not a directory or the
            expected file does not exist.
        ValueError: if ``zero_stage`` is neither <= 2 nor 3.
    """
    if not os.path.isdir(checkpoint_dir):
        raise FileNotFoundError(f"Directory '{checkpoint_dir}' doesn't exist")
    # there should be only one file
    if zero_stage <= 2:
        file = os.path.join(checkpoint_dir, "mp_rank_00_model_states.pt")
    elif zero_stage == 3:
        file = os.path.join(checkpoint_dir, "zero_pp_rank_0_mp_rank_00_model_states.pt")
    else:
        # An unsupported stage must fail explicitly; without this branch `file`
        # would be unbound below and surface as an UnboundLocalError.
        raise ValueError(f"unknown zero stage {zero_stage}")
    if not os.path.exists(file):
        raise FileNotFoundError(f"can't find model states file at '{file}'")
    return file
def get_checkpoint_files(checkpoint_dir, glob_pattern):
    """Return the files in ``checkpoint_dir`` matching ``glob_pattern``, in natural order.

    Raises:
        FileNotFoundError: if nothing matches the pattern.
    """
    # XXX: need to test that this simple glob rule works for multi-node setup too
    search_path = os.path.join(checkpoint_dir, glob_pattern)
    ckpt_files = glob.glob(search_path)
    # natural ordering, so that e.g. rank 2 sorts before rank 10
    ckpt_files.sort(key=natural_keys)
    if not ckpt_files:
        raise FileNotFoundError(f"can't find {glob_pattern} files in directory '{checkpoint_dir}'")
    return ckpt_files
def get_optim_files(checkpoint_dir):
    """Return the ``*_optim_states.pt`` files of ``checkpoint_dir`` via ``get_checkpoint_files``."""
    optim_glob = "*_optim_states.pt"
    return get_checkpoint_files(checkpoint_dir, optim_glob)
def get_model_state_files(checkpoint_dir):
    """Return the ``*_model_states.pt`` files of ``checkpoint_dir`` via ``get_checkpoint_files``."""
    model_glob = "*_model_states.pt"
    return get_checkpoint_files(checkpoint_dir, model_glob)
def parse_model_states(files):
    """Load every model-states file and keep only what reconstruction needs.

    Args:
        files: paths of the ``*_model_states.pt`` files to read.

    Returns:
        A list with one ``zero_model_state`` per entry of ``files``, in the same order.

    Raises:
        ValueError: if a loaded file has no ``BUFFER_NAMES`` entry.
    """
    zero_model_states = []
    for file in files:
        # NOTE(review): torch.load unpickles the file, which can execute arbitrary
        # code - only run this script on checkpoints from a trusted source.
        state_dict = torch.load(file, map_location=device)
        if BUFFER_NAMES not in state_dict:
            raise ValueError(f"{file} is not a model state checkpoint")
        buffer_names = state_dict[BUFFER_NAMES]
        if debug:
            print("Found buffers:", buffer_names)
        # recover just the buffers while restoring them to fp32 if they were saved in fp16
        buffers = {k: v.float() for k, v in state_dict["module"].items() if k in buffer_names}
        param_shapes = state_dict[PARAM_SHAPES]
        # frozen parameters are optional: the entry is absent (None) when there are none
        frozen_param_shapes = state_dict.get(FROZEN_PARAM_SHAPES, None)
        if debug and frozen_param_shapes is not None:
            print(f"Found frozen_param_shapes: {frozen_param_shapes}")
        # handle shared params
        shared_params = [[k, v] for k, v in state_dict["shared_params"].items()]
        ds_version = state_dict.get(DS_VERSION, None)
        frozen_param_fragments = state_dict.get(FROZEN_PARAM_FRAGMENTS, None)
        zero_model_states.append(
            zero_model_state(buffers=buffers,
                             param_shapes=param_shapes,
                             shared_params=shared_params,
                             ds_version=ds_version,
                             frozen_param_shapes=frozen_param_shapes,
                             frozen_param_fragments=frozen_param_fragments))
    return zero_model_states
def parse_optim_states(files, ds_checkpoint_dir):
    """Load the optimizer-state files and extract the fp32 partitions.

    Args:
        files: paths of the ``*_optim_states.pt`` files, one per partition.
        ds_checkpoint_dir: checkpoint directory; only used in error messages.

    Returns:
        A tuple ``(zero_stage, world_size, fp32_flat_groups)``. ``fp32_flat_groups``
        has one entry per file: for stage <= 2 the stored
        ``SINGLE_PARTITION_OF_FP32_GROUPS`` value, for stage 3 the stored
        ``FP32_FLAT_GROUPS`` tensors concatenated into a single tensor.

    Raises:
        ValueError: if the first file carries no ZeRO stage, if the number of
            files differs from the recorded partition count, or if the stage
            is neither <= 2 nor 3.
    """
    total_files = len(files)
    state_dicts = []
    for f in files:
        # NOTE(review): torch.load unpickles the file, which can execute arbitrary
        # code - only run this script on checkpoints from a trusted source.
        state_dict = torch.load(f, map_location=device)
        # immediately discard the potentially huge 2 optimizer states as we only care for fp32 master weights
        # and also handle the case where it was already removed by another helper script
        state_dict["optimizer_state_dict"].pop("optimizer_state_dict", None)
        state_dicts.append(state_dict)
    # stage and partition count are read from the first file only
    first_optim_state = state_dicts[0][OPTIMIZER_STATE_DICT]
    if ZERO_STAGE not in first_optim_state:
        raise ValueError(f"{files[0]} is not a zero checkpoint")
    zero_stage = first_optim_state[ZERO_STAGE]
    world_size = first_optim_state[PARTITION_COUNT]
    # For ZeRO-2 each param group can have different partition_count as data parallelism for expert
    # parameters can be different from data parallelism for non-expert parameters. So we can just
    # use the max of the partition_count to get the dp world_size.
    if isinstance(world_size, list):
        world_size = max(world_size)
    if world_size != total_files:
        raise ValueError(
            f"Expected {world_size} of '*_optim_states.pt' under '{ds_checkpoint_dir}' but found {total_files} files. "
            "Possibly due to an overwrite of an old checkpoint, or a checkpoint didn't get saved by one or more processes."
        )
    # the groups are named differently in each stage
    if zero_stage <= 2:
        fp32_flat_groups = [sd[OPTIMIZER_STATE_DICT][SINGLE_PARTITION_OF_FP32_GROUPS] for sd in state_dicts]
    elif zero_stage == 3:
        # if there is more than one param group, there will be multiple flattened tensors - one
        # flattened tensor per group - for simplicity merge them into a single tensor
        #
        # XXX: could make the script more memory efficient for when there are multiple groups - it
        # will require matching the sub-lists of param_shapes for each param group flattened tensor
        fp32_flat_groups = [torch.cat(sd[OPTIMIZER_STATE_DICT][FP32_FLAT_GROUPS], 0) for sd in state_dicts]
    else:
        raise ValueError(f"unknown zero stage {zero_stage}")
    return zero_stage, world_size, fp32_flat_groups
def _get_fp32_state_dict_from_zero_checkpoint(ds_checkpoint_dir, exclude_frozen_parameters):
    """
    Returns fp32 state_dict reconstructed from ds checkpoint
    Args:
        - ``ds_checkpoint_dir``: path to the deepspeed checkpoint folder (where the optimizer files are)
        - ``exclude_frozen_parameters``: forwarded unchanged to the stage-specific reconstruction helper
    """
    print(f"Processing zero checkpoint '{ds_checkpoint_dir}'")
    # optimizer files: ZeRO stage, world size and the flat fp32 partitions
    zero_stage, world_size, fp32_flat_groups = parse_optim_states(get_optim_files(ds_checkpoint_dir),
                                                                  ds_checkpoint_dir)
    print(f"Detected checkpoint of type zero stage {zero_stage}, world_size: {world_size}")
    # model-state files: buffers, shapes and shared/frozen parameter info
    zero_model_states = parse_model_states(get_model_state_files(ds_checkpoint_dir))
    print(f'Parsing checkpoint created by deepspeed=={zero_model_states[0].ds_version}')
    # pick the stage-specific reconstruction routine
    if zero_stage <= 2:
        rebuild = _get_fp32_state_dict_from_zero2_checkpoint
    elif zero_stage == 3:
        rebuild = _get_fp32_state_dict_from_zero3_checkpoint
    else:
        return None
    return rebuild(world_size, fp32_flat_groups, zero_model_states, exclude_frozen_parameters)
def _zero2_merge_frozen_params(state_dict, zero_model_states):
    """Copy the frozen parameters stored in the first model state into ``state_dict``.

    Does nothing when the first model state has no (or an empty)
    ``frozen_param_shapes``. Every stored fragment is inserted as-is under its
    parameter name; the shapes are only used for counting and debug output.
    """
    first_state = zero_model_states[0]
    frozen_param_shapes = first_state.frozen_param_shapes
    if not frozen_param_shapes:
        return
    frozen_param_fragments = first_state.frozen_param_fragments
    if debug:
        num_elem = sum(s.numel() for s in frozen_param_shapes.values())
        print(f'rank 0: {FROZEN_PARAM_SHAPES}.numel = {num_elem}')
        wanted_params = len(frozen_param_shapes)
        wanted_numel = num_elem
        avail_numel = sum(p.numel() for p in frozen_param_fragments.values())
        print(f'Frozen params: Have {avail_numel} numels to process.')
        print(f'Frozen params: Need {wanted_numel} numels in {wanted_params} params')
    total_numel = 0
    for name, shape in frozen_param_shapes.items():
        unpartitioned_numel = shape.numel()
        total_numel += unpartitioned_numel
        state_dict[name] = frozen_param_fragments[name]
        if debug:
            print(f"{name} full shape: {shape} unpartitioned numel {unpartitioned_numel} ")
    total_params = len(frozen_param_shapes)
    print(f"Reconstructed Frozen fp32 state dict with {total_params} params {total_numel} elements")
+def _has_callable(obj, fn):
+    attr = getattr(obj, fn, None)
+    return callable(attr)
def _zero2_merge_trainable_params(state_dict, world_size, fp32_flat_groups, zero_model_states):
    """Rebuild the trainable parameters of a ZeRO stage <= 2 checkpoint into ``state_dict``.

    For every param group the per-rank partitions are concatenated into one flat
    vector; each parameter is then taken from that vector as a ``narrow`` +
    ``view`` in the order given by ``param_shapes`` of the first model state.

    Raises:
        ValueError: if, after alignment to ``2 * world_size``, the consumed
            element count differs from the size of a group's flat vector.
    """
    param_shapes = zero_model_states[0].param_shapes
    num_param_groups = len(fp32_flat_groups[0])
    # Reconstruction protocol:
    #
    # XXX: document this
    if debug:
        for i in range(world_size):
            for j in range(num_param_groups):
                print(f"{FP32_FLAT_GROUPS}[{i}][{j}].shape={fp32_flat_groups[i][j].shape}")
    # XXX: memory usage doubles here (zero2)
    merged_single_partition_of_fp32_groups = [
        torch.cat([rank_groups[group_idx] for rank_groups in fp32_flat_groups], 0)
        for group_idx in range(num_param_groups)
    ]
    if debug:
        avail_numel = sum(vec.numel() for vec in merged_single_partition_of_fp32_groups)
        wanted_params = sum(len(shapes) for shapes in param_shapes)
        wanted_numel = sum(sum(shape.numel() for shape in shapes.values()) for shapes in param_shapes)
        # not asserting if there is a mismatch due to possible padding
        print(f"Have {avail_numel} numels to process.")
        print(f"Need {wanted_numel} numels in {wanted_params} params.")

    # Z2 started to align to 2*world_size to improve nccl performance. Therefore both offset and
    # avail_numel can differ by anywhere between 0..2*world_size. Due to two unrelated complex
    # paddings performed in the code it's almost impossible to predict the exact numbers w/o the
    # live optimizer object, so we are checking that the numbers are within the right range
    align_to = 2 * world_size

    def zero2_align(x):
        return align_to * math.ceil(x / align_to)

    # params
    # XXX: for huge models that can't fit into the host's RAM we will have to recode this to support
    # out-of-core computing solution
    total_numel = 0
    total_params = 0
    for shapes, full_single_fp32_vector in zip(param_shapes, merged_single_partition_of_fp32_groups):
        offset = 0
        for name, shape in shapes.items():
            # shapes exposing a callable ``numel`` are asked directly; anything
            # else is treated as a plain sequence of dimensions
            if _has_callable(shape, 'numel'):
                unpartitioned_numel = shape.numel()
            else:
                unpartitioned_numel = math.prod(shape)
            total_numel += unpartitioned_numel
            total_params += 1
            if debug:
                print(f"{name} full shape: {shape} unpartitioned numel {unpartitioned_numel} ")
            state_dict[name] = full_single_fp32_vector.narrow(0, offset, unpartitioned_numel).view(shape)
            offset += unpartitioned_numel
        avail_numel = full_single_fp32_vector.numel()
        if debug:
            print(f"original offset={offset}, avail_numel={avail_numel}")
        offset = zero2_align(offset)
        avail_numel = zero2_align(avail_numel)
        if debug:
            print(f"aligned  offset={offset}, avail_numel={avail_numel}")
        # Sanity check
        if offset != avail_numel:
            raise ValueError(f"consumed {offset} numels out of {avail_numel} - something is wrong")
    print(f"Reconstructed fp32 state dict with {total_params} params {total_numel} elements")
def _get_fp32_state_dict_from_zero2_checkpoint(world_size, fp32_flat_groups, zero_model_states,
                                               exclude_frozen_parameters):
    """Assemble the state dict of a ZeRO stage <= 2 checkpoint.

    Insertion order is: buffers, frozen parameters (unless
    ``exclude_frozen_parameters`` is truthy), trainable parameters, and finally
    the shared-parameter entries.

    Returns:
        An ``OrderedDict`` mapping names to tensors.
    """
    first_state = zero_model_states[0]
    # buffers
    buffers = first_state.buffers
    state_dict = OrderedDict()
    state_dict.update(buffers)
    if debug:
        print(f"added {len(buffers)} buffers")
    if not exclude_frozen_parameters:
        _zero2_merge_frozen_params(state_dict, zero_model_states)
    _zero2_merge_trainable_params(state_dict, world_size, fp32_flat_groups, zero_model_states)
    # recover shared parameters: an entry is added only when the name it refers
    # to has been reconstructed
    for pair in first_state.shared_params:
        alias, source = pair[0], pair[1]
        if source in state_dict:
            state_dict[alias] = state_dict[source]
    return state_dict
def zero3_partitioned_param_info(unpartitioned_numel, world_size):
    """Return ``(partitioned_numel, padding_numel)`` for one parameter.

    ``partitioned_numel`` is ``unpartitioned_numel / world_size`` rounded up;
    ``padding_numel`` is how many elements are missing to reach the next
    multiple of ``world_size`` (0 when it already divides evenly).
    """
    leftover = unpartitioned_numel % world_size
    if leftover:
        padding_numel = world_size - leftover
    else:
        padding_numel = 0
    partitioned_numel = math.ceil(unpartitioned_numel / world_size)
    return partitioned_numel, padding_numel
def _zero3_merge_frozen_params(state_dict, world_size, zero_model_states):
    """Rebuild the frozen parameters of a ZeRO stage 3 checkpoint into ``state_dict``.

    Does nothing when the first model state has no (or an empty)
    ``frozen_param_shapes``. Otherwise, for every frozen parameter, the fragments
    of all model states are concatenated, cut to the parameter's element count
    and viewed with its full shape.
    """
    frozen_param_shapes = zero_model_states[0].frozen_param_shapes
    if not frozen_param_shapes:
        return
    if debug:
        for i in range(world_size):
            num_elem = sum(s.numel() for s in zero_model_states[i].frozen_param_fragments.values())
            print(f'rank {i}: {FROZEN_PARAM_SHAPES}.numel = {num_elem}')
        wanted_params = len(frozen_param_shapes)
        wanted_numel = sum(s.numel() for s in frozen_param_shapes.values())
        avail_numel = sum(p.numel() for p in zero_model_states[0].frozen_param_fragments.values()) * world_size
        print(f'Frozen params: Have {avail_numel} numels to process.')
        print(f'Frozen params: Need {wanted_numel} numels in {wanted_params} params')
    total_params = 0
    total_numel = 0
    for total_params, (name, shape) in enumerate(frozen_param_shapes.items(), start=1):
        unpartitioned_numel = shape.numel()
        total_numel += unpartitioned_numel
        # one fragment per model state, in model-state order
        param_frags = tuple(model_state.frozen_param_fragments[name] for model_state in zero_model_states)
        # narrow() drops whatever lies beyond the parameter's own element count
        joined = torch.cat(param_frags, 0)
        state_dict[name] = joined.narrow(0, 0, unpartitioned_numel).view(shape)
        partitioned_numel, partitioned_padding_numel = zero3_partitioned_param_info(unpartitioned_numel, world_size)
        if debug:
            print(
                f"Frozen params: {total_params} {name} full shape: {shape} partition0 numel={partitioned_numel} partitioned_padding_numel={partitioned_padding_numel}"
            )
    print(f"Reconstructed Frozen fp32 state dict with {total_params} params {total_numel} elements")
def _zero3_merge_trainable_params(state_dict, world_size, fp32_flat_groups, zero_model_states):
    """
    Rebuild every trainable parameter from the per-rank flat fp32 groups and store it in ``state_dict``.

    Reconstruction protocol: for zero3 the partitions are zipped together at the boundary of each
    param, re-consolidating each param, while dealing with padding if any.

    Args:
        - ``state_dict``: mapping that is updated in place with ``name -> reconstructed tensor``
        - ``world_size``: number of ranks the parameters were partitioned over
        - ``fp32_flat_groups``: one flat tensor per rank (indexed by rank) holding that rank's slices
        - ``zero_model_states``: per-rank model states; rank 0's ``param_shapes`` (a list of
          ``name -> shape`` dicts) defines the parameters and their order

    Raises:
        - ``ValueError``: if the number of consumed elements differs from
          ``fp32_flat_groups[0].numel() * world_size``
    """
    avail_numel = fp32_flat_groups[0].numel() * world_size

    # merge list of dicts, preserving order
    param_shapes = {}
    for shape_group in zero_model_states[0].param_shapes:
        param_shapes.update(shape_group)

    if debug:
        for rank in range(world_size):
            print(f"{FP32_FLAT_GROUPS}[{rank}].shape={fp32_flat_groups[rank].shape}")

        wanted_params = len(param_shapes)
        wanted_numel = sum(shape.numel() for shape in param_shapes.values())
        # not asserting if there is a mismatch due to possible padding
        print(f"Trainable params: Have {avail_numel} numels to process.")
        print(f"Trainable params: Need {wanted_numel} numels in {wanted_params} params.")

    # XXX: for huge models that can't fit into the host's RAM we will have to recode this to support
    # out-of-core computing solution
    offset = 0  # read position inside each rank's flat group
    total_numel = 0
    total_params = 0
    for total_params, (name, shape) in enumerate(param_shapes.items(), start=1):
        unpartitioned_numel = shape.numel()
        total_numel += unpartitioned_numel

        partitioned_numel, partitioned_padding_numel = zero3_partitioned_param_info(unpartitioned_numel, world_size)
        if debug:
            print(
                f"Trainable params: {total_params} {name} full shape: {shape} partition0 numel={partitioned_numel} partitioned_padding_numel={partitioned_padding_numel}"
            )

        # XXX: memory usage doubles here
        # take the same window out of every rank, join in rank order, trim to the real size
        rank_slices = tuple(fp32_flat_groups[rank].narrow(0, offset, partitioned_numel) for rank in range(world_size))
        state_dict[name] = torch.cat(rank_slices, 0).narrow(0, 0, unpartitioned_numel).view(shape)
        offset += partitioned_numel

    # Sanity check: every rank advanced by ``offset``, so this many elements were consumed in total
    consumed_numel = offset * world_size
    if consumed_numel != avail_numel:
        raise ValueError(f"consumed {consumed_numel} numels out of {avail_numel} - something is wrong")

    print(f"Reconstructed Trainable fp32 state dict with {total_params} params {total_numel} elements")
def _get_fp32_state_dict_from_zero3_checkpoint(world_size, fp32_flat_groups, zero_model_states,
                                               exclude_frozen_parameters):
    """
    Assemble the consolidated state dict of a ZeRO-3 checkpoint.

    The result is filled in this order: rank 0's buffers, the frozen parameters (unless
    ``exclude_frozen_parameters`` is set), the trainable parameters, and finally the shared
    parameters listed by rank 0.

    Args:
        - ``world_size``: number of ranks the checkpoint was partitioned over
        - ``fp32_flat_groups``: per-rank flat groups passed on to the trainable-parameter merge
        - ``zero_model_states``: per-rank model states; buffers and shared params come from rank 0
        - ``exclude_frozen_parameters``: skip the frozen-parameter merge when true

    Returns:
        - ``OrderedDict`` mapping names to tensors
    """
    rank0_state = zero_model_states[0]

    # buffers
    model_buffers = rank0_state.buffers
    state_dict = OrderedDict(model_buffers)
    if debug:
        print(f"added {len(model_buffers)} buffers")

    if not exclude_frozen_parameters:
        _zero3_merge_frozen_params(state_dict, world_size, zero_model_states)

    _zero3_merge_trainable_params(state_dict, world_size, fp32_flat_groups, zero_model_states)

    # recover shared parameters: pair[0] becomes another name for the tensor stored under pair[1]
    for pair in rank0_state.shared_params:
        source_name = pair[1]
        if source_name in state_dict:
            state_dict[pair[0]] = state_dict[source_name]

    return state_dict
def get_fp32_state_dict_from_zero_checkpoint(checkpoint_dir, tag=None, exclude_frozen_parameters=False):
    """
    Convert ZeRO 2 or 3 checkpoint into a single fp32 consolidated state_dict that can be loaded with
    ``load_state_dict()`` and used for training without DeepSpeed or shared with others, for example
    via a model hub.

    Args:
        - ``checkpoint_dir``: path to the desired checkpoint folder
        - ``tag``: checkpoint tag used as a unique identifier for checkpoint, e.g., ``global_step14``.
          If not provided, the tag is read from the file named ``latest`` inside ``checkpoint_dir``
        - ``exclude_frozen_parameters``: exclude frozen parameters

    Returns:
        - pytorch ``state_dict``

    Raises:
        - ``ValueError``: ``tag`` was not given and ``checkpoint_dir`` has no ``latest`` file
        - ``FileNotFoundError``: the folder ``checkpoint_dir/tag`` doesn't exist

    Note: this approach may not work if your application doesn't have sufficient free CPU memory and
    you may need to use the offline approach using the ``zero_to_fp32.py`` script that is saved with
    the checkpoint.

    A typical usage might be ::

        from deepspeed.utils.zero_to_fp32 import get_fp32_state_dict_from_zero_checkpoint
        # do the training and checkpoint saving
        state_dict = get_fp32_state_dict_from_zero_checkpoint(checkpoint_dir) # already on cpu
        model = model.cpu() # move to cpu
        model.load_state_dict(state_dict)
        # submit to model hub or save the model to share with others

    In this example the ``model`` will no longer be usable in the deepspeed context of the same
    application. i.e. you will need to re-initialize the deepspeed engine, since
    ``model.load_state_dict(state_dict)`` will remove all the deepspeed magic from it.

    If you want it all done for you, use ``load_state_dict_from_zero_checkpoint`` instead.
    """
    if tag is None:
        # no explicit tag: fall back to whatever the 'latest' marker file names
        latest_path = os.path.join(checkpoint_dir, 'latest')
        if not os.path.isfile(latest_path):
            raise ValueError(f"Unable to find 'latest' file at {latest_path}")
        with open(latest_path, 'r') as fd:
            tag = fd.read().strip()

    ds_checkpoint_dir = os.path.join(checkpoint_dir, tag)
    if os.path.isdir(ds_checkpoint_dir):
        return _get_fp32_state_dict_from_zero_checkpoint(ds_checkpoint_dir, exclude_frozen_parameters)

    raise FileNotFoundError(f"Directory '{ds_checkpoint_dir}' doesn't exist")
def convert_zero_checkpoint_to_fp32_state_dict(checkpoint_dir, output_file, tag=None, exclude_frozen_parameters=False):
    """
    Convert ZeRO 2 or 3 checkpoint into a single fp32 consolidated ``state_dict`` file that can be
    loaded with ``torch.load(file)`` + ``load_state_dict()`` and used for training without DeepSpeed.

    Args:
        - ``checkpoint_dir``: path to the desired checkpoint folder. (one that contains the tag-folder, like ``global_step14``)
        - ``output_file``: path to the pytorch fp32 state_dict output file (e.g. path/pytorch_model.bin)
        - ``tag``: checkpoint tag used as a unique identifier for checkpoint. If not provided will attempt to load tag in the file named ``latest`` in the checkpoint folder, e.g., ``global_step14``
        - ``exclude_frozen_parameters``: exclude frozen parameters
    """
    # step 1: consolidate the checkpoint in memory
    fp32_state_dict = get_fp32_state_dict_from_zero_checkpoint(
        checkpoint_dir,
        tag,
        exclude_frozen_parameters,
    )

    # step 2: write it out as one file
    print(f"Saving fp32 state dict to {output_file}")
    torch.save(fp32_state_dict, output_file)
def load_state_dict_from_zero_checkpoint(model, checkpoint_dir, tag=None):
    """
    1. Put the provided model to cpu
    2. Convert ZeRO 2 or 3 checkpoint into a single fp32 consolidated ``state_dict``
    3. Load it into the provided model

    Args:
        - ``model``: the model object to update
        - ``checkpoint_dir``: path to the desired checkpoint folder. (one that contains the tag-folder, like ``global_step14``)
        - ``tag``: checkpoint tag used as a unique identifier for checkpoint. If not provided will attempt to load tag in the file named ``latest`` in the checkpoint folder, e.g., ``global_step14``

    Returns:
        - ``model``: modified model

    Make sure you have plenty of CPU memory available before you call this function. If you don't
    have enough use the ``zero_to_fp32.py`` utility to do the conversion. You will find it
    conveniently placed for you in the checkpoint folder.

    A typical usage might be ::

        from deepspeed.utils.zero_to_fp32 import load_state_dict_from_zero_checkpoint
        model = load_state_dict_from_zero_checkpoint(trainer.model, checkpoint_dir)
        # submit to model hub or save the model to share with others

    Note, that once this was run, the ``model`` will no longer be usable in the deepspeed context
    of the same application. i.e. you will need to re-initialize the deepspeed engine, since
    ``model.load_state_dict(state_dict)`` will remove all the deepspeed magic from it.
    """
    # plain string literals: these messages have no placeholders, so no f-prefix is needed
    logger.info("Extracting fp32 weights")
    state_dict = get_fp32_state_dict_from_zero_checkpoint(checkpoint_dir, tag)

    logger.info("Overwriting model with fp32 weights")
    model = model.cpu()
    # strict=False: keys that don't match between the model and the state dict are not treated as errors
    model.load_state_dict(state_dict, strict=False)

    return model
if __name__ == "__main__":
    # Command-line entry point: parse the arguments, set the module-level ``debug`` flag and run
    # the conversion.
    parser = argparse.ArgumentParser()

    # (names/flags, keyword options) for each argument, registered in this order
    argument_table = (
        (("checkpoint_dir", ), {
            "type": str,
            "help": "path to the desired checkpoint folder, e.g., path/checkpoint-12",
        }),
        (("output_file", ), {
            "type": str,
            "help": "path to the pytorch fp32 state_dict output file (e.g. path/checkpoint-12/pytorch_model.bin)",
        }),
        (("-t", "--tag"), {
            "type": str,
            "default": None,
            "help": "checkpoint tag used as a unique identifier for checkpoint. e.g., global_step1",
        }),
        (("--exclude_frozen_parameters", ), {
            "action": 'store_true',
            "help": "exclude frozen parameters",
        }),
        (("-d", "--debug"), {
            "action": 'store_true',
            "help": "enable debug",
        }),
    )
    for names, options in argument_table:
        parser.add_argument(*names, **options)

    args = parser.parse_args()

    # the merge helpers in this module read ``debug`` as a global
    debug = args.debug

    convert_zero_checkpoint_to_fp32_state_dict(
        args.checkpoint_dir,
        args.output_file,
        tag=args.tag,
        exclude_frozen_parameters=args.exclude_frozen_parameters,
    )